Model

library(DiagrammeR) #⋉
DiagrammeR::grViz("
digraph causal {

  # Nodes
  node [shape = plaintext]
  X [label = 'Observed\nConfounders\n(X)',fontsize=10]
  Z [label = 'Residential\nPrograms\n(Z)',fontsize=10]
  Y [label = 'Early Drop-out\n(Y)',fontsize=10]
  U [label = 'Unobserved\nConfounders\n(U)',fontsize=10]

# Nodes
 node [shape = box]
  S [label = 'Matched\n(S=1)',fontsize=7]
  C [label = 'Not censored\n(C=0)',fontsize=7]
  # Edges
  edge [color = black,
        arrowhead = vee]
  rankdir = LR
  
  Z-> Y [minlen=2]
  X -> Z [minlen=2]
  X -> Y 

  U -> Z 
  U -> Y 
  
  X -> S #[minlen=1]
  Z -> S #[minlen=1]
  
  X -> C #[minlen=3]
  Z -> C #[minlen=3]
#{ rank = same; X; Y }
{ rank = same; S; C }
{ rank = same; X; U }

  # Graph
  graph [overlap = true]
}")

Figure 1. Directed Acyclic Graph

#  {rank=same ; A -> B -> C -> D};
#       {rank=same ;           F -> E[dir=back]};
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3733703/
#Cohort matching on a variable associated with both outcome and censoring
#Cohort matching on a confounder. We let A denote an exposure, Y denote an outcome, and C denote a confounder and matching variable. The variable S indicates whether an individual in the source population is selected for the matched study (1: selected, 0: not selected). See Section 2-7 for details.
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7064555/

Balance

We selected the cases that had completed treatments, leaving ras.numeric(sum(sel_treat_tabyl$n))%>% formatC(big.mark=“,”)observations. Must take note that we did not excluded those that did not have a status of a finished treatment (n=r tot_sin_motivo_egreso %>% nrow() %>% formatC(big.mark=",")), or had more than one finished treatment previous to the selected (n= 2.238e+04).


We selected the following variables of interest:

  • “Starting Substance” (sus_ini_mvv)
  • “Marital Status” (estado_conyugal_2)
  • “Educational Attainment” (escolaridad_rec)
  • “Occupational Status” (estatus_ocupacional_rec)
  • “Age of Onset of Drug Use” (edad_ini_cons)
  • “Frequency of use of primary drug” (freq_cons_sus_prin)
  • “Motive of Admission to Treatment” (origen_ingreso_mod)
  • “Psychiatric comorbidity” (dg_cie_10_rec)
  • “Chilean Region of the Center” (nombre_region)
  • “Type of Center (Public)” (tipo_centro_pub)
  • “Early Dropout” (abandono_temprano_rec) (Y)
  • “Residential Type of Plan” (tipo_de_plan_res) (Z)
  • “Evaluation of the Therapeutic Process (Min. Achievement)” (*) (evaluacindelprocesoteraputico_min)
  • “No. of Previous Treatments” (prev_treat)
  • “Sex” (sexo_2)
  • “Age at Admission to Treatment” (edad_al_ing)
  • “Date of Admission to Treatment” (fech_ing_num)


We generated a correlation plot to get an overview of heterogeneous correlations between the different variables.


require(polycor)
#Corresponde a la apreciación clínica que hace el equipo o profesional tratante, la persona en tratamiento y su familia, del nivel alcanzado de logro de los objetivos terapéuticos planteados al inicio del proceso y descritos en el plan de tratamiento personalizado. Los criterios incluyen la evaluación del estado clínico y psicosocial al momento del egreso y una apreciación pronostica del equipo tratante.

match.on_tot <- c("row", "hash_key","sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","estatus_ocupacional_rec","edad_ini_cons","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","nombre_region","tipo_centro_pub","abandono_temprano_rec","tipo_de_plan_res","evaluacindelprocesoteraputico_min","prev_treat","sexo_2","edad_al_ing","fech_ing_num")

#CONS_C1_df_dup_SEP_2020 %>% janitor::tabyl(evaluacindelprocesoteraputico)

CONS_C1_df_dup_SEP_2020_match<-
  CONS_C1_df_dup_SEP_2020 %>% 
#:#:#:#:#:#:#:#:
  #sacar tratamientos truncados
  dplyr::filter(as.character(motivodeegreso_mod_imp)!="Ongoing treatment"|is.na(motivodeegreso_mod_imp)) %>% 
#:#:#:#:#:#:#:#:
  #dplyr::mutate(tipo_de_plan_ambulatorio=ifelse(grepl("-PA",as.character(tipo_de_plan_2)),TRUE,FALSE)) %>% 
  dplyr::mutate(tipo_de_plan_res=if_else(as.character(tipo_de_plan_2) %in% c("M-PR","PG-PR"),TRUE,FALSE,NA)) %>% 
  dplyr::mutate(abandono_temprano_rec=ifelse(as.character(abandono_temprano)=="Menos de 90 días",TRUE,FALSE)) %>% 
  dplyr::mutate(evaluacindelprocesoteraputico_min=ifelse(as.character(evaluacindelprocesoteraputico)=="3-Logro Minimo",TRUE,FALSE)) %>% 
  dplyr::mutate(tipo_centro_pub=if_else(as.character(tipo_centro)=="Public",TRUE,FALSE,NA)) %>% 
  dplyr::mutate(prev_treat=factor(dplyr::case_when(dup>=3~"2 or more",
                                                   TRUE~as.character(dup-1)))) %>% 
  dplyr::select_(.dots = match.on_tot) %>% 
  data.table::data.table()
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#CONS_C1_df_dup_SEP_2020_match %>% janitor::tabyl(prev_treat)
#CONS_C1_df_dup_SEP_2020 %>% janitor::tabyl(motivodeegreso_mod_imp)

#monthly_dataset_comp_wise_dt<-monthly_dataset_comp_wise%>% dplyr::select_if(is.numeric)%>% dplyr::select(-stringency_ind_oxf,-owid_gbd_year)%>% dplyr::select(-ends_with("mth")) %>% data.frame()

#Computes a heterogenous correlation matrix, consisting of Pearson product-moment correlations between numeric variables, polyserial correlations between numeric and ordinal variables, and polychoric correlations between 
tiempo_antes_hetcor<-Sys.time()
hetcor_mat<-hetcor(CONS_C1_df_dup_SEP_2020_match[,-c("hash_key","row")], ML = T, std.err =T, use="pairwise.complete.obs", bins=3, pd=TRUE)
tiempo_despues_hetcor<-Sys.time()
tiempo_hetcor<-tiempo_despues_hetcor-tiempo_antes_hetcor
#umx_hetcor_mat<-umx::umxHetCor(CONS_C1_df_dup_SEP_2020_match[,-c("hash_key","row")], ML = T, use = c("pairwise.complete.obs"), treatAllAsFactor = FALSE, verbose = F)

#ttr(umx_hetcor_mat,"dimnames")[[2]][1]<-"Starting Substance"
#attr(umx_hetcor_mat,"dimnames")[[2]][2]<-"Marital Status"
#attr(umx_hetcor_mat,"dimnames")[[2]][3]<-"Educational Attainment"
#attr(umx_hetcor_mat,"dimnames")[[2]][4]<-"Occupational Status"
#attr(umx_hetcor_mat,"dimnames")[[2]][5]<-"Age of Onset of Drug Use"
#attr(umx_hetcor_mat,"dimnames")[[2]][6]<-"Frequency of use of primary drug"
#attr(umx_hetcor_mat,"dimnames")[[2]][7]<-"Motive of Admission to Treatment"
#attr(umx_hetcor_mat,"dimnames")[[2]][8]<-"Psychiatric comorbidity"
#attr(umx_hetcor_mat,"dimnames")[[2]][9]<-"Chilean Region of the Center"
#attr(umx_hetcor_mat,"dimnames")[[2]][10]<-"Type of Center (Public)"
#attr(umx_hetcor_mat,"dimnames")[[2]][11]<-"Early Dropout"
#attr(umx_hetcor_mat,"dimnames")[[2]][12]<-"Residential Type of Plan"
#attr(umx_hetcor_mat,"dimnames")[[2]][13]<-"Evaluation of the Therapeutic Process (Min. Achievement)"
#attr(umx_hetcor_mat,"dimnames")[[2]][14]<-"No. of Previous Treatments"
#attr(umx_hetcor_mat,"dimnames")[[2]][15]<-"Sex"
#attr(umx_hetcor_mat,"dimnames")[[2]][16]<-"Age at Admission"

#attr(umx_hetcor_mat,"dimnames")[[1]][1]<-"Starting Substance"
#attr(umx_hetcor_mat,"dimnames")[[1]][2]<-"Marital Status"
#attr(umx_hetcor_mat,"dimnames")[[1]][3]<-"Educational Attainment"
#attr(umx_hetcor_mat,"dimnames")[[1]][4]<-"Occupational Status"
#attr(umx_hetcor_mat,"dimnames")[[1]][5]<-"Age of Onset of Drug Use"
#attr(umx_hetcor_mat,"dimnames")[[1]][6]<-"Frequency of use of primary drug"
#attr(umx_hetcor_mat,"dimnames")[[1]][7]<-"Motive of Admission to Treatment"
#attr(umx_hetcor_mat,"dimnames")[[1]][8]<-"Psychiatric comorbidity"
#attr(umx_hetcor_mat,"dimnames")[[1]][9]<-"Chilean Region of the Center"
#attr(umx_hetcor_mat,"dimnames")[[1]][10]<-"Type of Center (Public)"
#attr(umx_hetcor_mat,"dimnames")[[1]][11]<-"Early Dropout"
#attr(umx_hetcor_mat,"dimnames")[[1]][12]<-"Residential Type of Plan"
#attr(umx_hetcor_mat,"dimnames")[[1]][13]<-"Evaluation of the Therapeutic Process (Min. Achievement)"
#attr(umx_hetcor_mat,"dimnames")[[1]][14]<-"No. of Previous Treatments"
#attr(umx_hetcor_mat,"dimnames")[[1]][15]<-"Sex"
#attr(umx_hetcor_mat,"dimnames")[[1]][16]<-"Age at Admission"

attr(hetcor_mat$correlations,"dimnames")[[2]][1]<-"Starting Substance"
attr(hetcor_mat$correlations,"dimnames")[[2]][2]<-"Marital Status"
attr(hetcor_mat$correlations,"dimnames")[[2]][3]<-"Educational Attainment"
attr(hetcor_mat$correlations,"dimnames")[[2]][4]<-"Occupational Status"
attr(hetcor_mat$correlations,"dimnames")[[2]][5]<-"Age of Onset of Drug Use"
attr(hetcor_mat$correlations,"dimnames")[[2]][6]<-"Frequency of use of primary drug"
attr(hetcor_mat$correlations,"dimnames")[[2]][7]<-"Motive of Admission to Treatment"
attr(hetcor_mat$correlations,"dimnames")[[2]][8]<-"Psychiatric comorbidity"
attr(hetcor_mat$correlations,"dimnames")[[2]][9]<-"Chilean Region of the Center"
attr(hetcor_mat$correlations,"dimnames")[[2]][10]<-"Type of Center (Public)"
attr(hetcor_mat$correlations,"dimnames")[[2]][11]<-"Early Dropout"
attr(hetcor_mat$correlations,"dimnames")[[2]][12]<-"Residential Type of Plan"
attr(hetcor_mat$correlations,"dimnames")[[2]][13]<-"Evaluation of the Therapeutic Process (Min. Achievement)"
attr(hetcor_mat$correlations,"dimnames")[[2]][14]<-"No. of Previous Treatments"
attr(hetcor_mat$correlations,"dimnames")[[2]][15]<-"Sex"
attr(hetcor_mat$correlations,"dimnames")[[2]][16]<-"Age at Admission"
attr(hetcor_mat$correlations,"dimnames")[[2]][17]<-"Date of Admission"

attr(hetcor_mat$correlations,"dimnames")[[1]][1]<-"Starting Substance"
attr(hetcor_mat$correlations,"dimnames")[[1]][2]<-"Marital Status"
attr(hetcor_mat$correlations,"dimnames")[[1]][3]<-"Educational Attainment"
attr(hetcor_mat$correlations,"dimnames")[[1]][4]<-"Occupational Status"
attr(hetcor_mat$correlations,"dimnames")[[1]][5]<-"Age of Onset of Drug Use"
attr(hetcor_mat$correlations,"dimnames")[[1]][6]<-"Frequency of use of primary drug"
attr(hetcor_mat$correlations,"dimnames")[[1]][7]<-"Motive of Admission to Treatment"
attr(hetcor_mat$correlations,"dimnames")[[1]][8]<-"Psychiatric comorbidity"
attr(hetcor_mat$correlations,"dimnames")[[1]][9]<-"Chilean Region of the Center"
attr(hetcor_mat$correlations,"dimnames")[[1]][10]<-"Type of Center (Public)"
attr(hetcor_mat$correlations,"dimnames")[[1]][11]<-"Early Dropout"
attr(hetcor_mat$correlations,"dimnames")[[1]][12]<-"Residential Type of Plan"
attr(hetcor_mat$correlations,"dimnames")[[1]][13]<-"Evaluation of the Therapeutic Process (Min. Achievement)"
attr(hetcor_mat$correlations,"dimnames")[[1]][14]<-"No. of Previous Treatments"
attr(hetcor_mat$correlations,"dimnames")[[1]][15]<-"Sex"
attr(hetcor_mat$correlations,"dimnames")[[1]][16]<-"Age at Admission"
attr(hetcor_mat$correlations,"dimnames")[[1]][17]<-"Date of Admission"

attr(hetcor_mat$tests,"dimnames")[[2]][1]<-"Starting Substance"
attr(hetcor_mat$tests,"dimnames")[[2]][2]<-"Marital Status"
attr(hetcor_mat$tests,"dimnames")[[2]][3]<-"Educational Attainment"
attr(hetcor_mat$tests,"dimnames")[[2]][4]<-"Occupational Status"
attr(hetcor_mat$tests,"dimnames")[[2]][5]<-"Age of Onset of Drug Use"
attr(hetcor_mat$tests,"dimnames")[[2]][6]<-"Frequency of use of primary drug"
attr(hetcor_mat$tests,"dimnames")[[2]][7]<-"Motive of Admission to Treatment"
attr(hetcor_mat$tests,"dimnames")[[2]][8]<-"Psychiatric comorbidity"
attr(hetcor_mat$tests,"dimnames")[[2]][9]<-"Chilean Region of the Center"
attr(hetcor_mat$tests,"dimnames")[[2]][10]<-"Type of Center (Public)"
attr(hetcor_mat$tests,"dimnames")[[2]][11]<-"Early Dropout"
attr(hetcor_mat$tests,"dimnames")[[2]][12]<-"Residential Type of Plan"
attr(hetcor_mat$tests,"dimnames")[[2]][13]<-"Evaluation of the Therapeutic Process (Min. Achievement)"
attr(hetcor_mat$tests,"dimnames")[[2]][14]<-"No. of Previous Treatments"
attr(hetcor_mat$tests,"dimnames")[[2]][15]<-"Sex"
attr(hetcor_mat$tests,"dimnames")[[2]][16]<-"Age at Admission"
attr(hetcor_mat$tests,"dimnames")[[2]][17]<-"Date of Admission"

attr(hetcor_mat$tests,"dimnames")[[1]][1]<-"Starting Substance"
attr(hetcor_mat$tests,"dimnames")[[1]][2]<-"Marital Status"
attr(hetcor_mat$tests,"dimnames")[[1]][3]<-"Educational Attainment"
attr(hetcor_mat$tests,"dimnames")[[1]][4]<-"Occupational Status"
attr(hetcor_mat$tests,"dimnames")[[1]][5]<-"Age of Onset of Drug Use"
attr(hetcor_mat$tests,"dimnames")[[1]][6]<-"Frequency of use of primary drug"
attr(hetcor_mat$tests,"dimnames")[[1]][7]<-"Motive of Admission to Treatment"
attr(hetcor_mat$tests,"dimnames")[[1]][8]<-"Psychiatric comorbidity"
attr(hetcor_mat$tests,"dimnames")[[1]][9]<-"Chilean Region of the Center"
attr(hetcor_mat$tests,"dimnames")[[1]][10]<-"Type of Center (Public)"
attr(hetcor_mat$tests,"dimnames")[[1]][11]<-"Early Dropout"
attr(hetcor_mat$tests,"dimnames")[[1]][12]<-"Residential Type of Plan"
attr(hetcor_mat$tests,"dimnames")[[1]][13]<-"Evaluation of the Therapeutic Process (Min. Achievement)"
attr(hetcor_mat$tests,"dimnames")[[1]][14]<-"No. of Previous Treatments"
attr(hetcor_mat$tests,"dimnames")[[1]][15]<-"Sex"
attr(hetcor_mat$tests,"dimnames")[[1]][16]<-"Age at Admission"
attr(hetcor_mat$tests,"dimnames")[[1]][17]<-"Date of Admission"

hetcor_mat$tests[is.na(hetcor_mat$tests)]<-1

ggcorrplot<-
ggcorrplot::ggcorrplot(hetcor_mat$correlations,
           ggtheme = ggplot2::theme_void,
           insig = "blank",
           pch=1,
           pch.cex=3,
           tl.srt = 45, 
           #pch="ns",
            p.mat = hetcor_mat$tests, #  replacement has 144 rows, data has 169
            #type = "lower",
           colors = c("#6D9EC1", "white", "#E46726"), 
           tl.cex=8,
           lab=F)+
  #scale_x_discrete(labels = var_lbls_p345, drop = F) +
  #scale_y_discrete(labels = var_lbls_p345, drop = F) +
  theme(axis.text.x = element_blank())+
  #theme(axis.text.y = element_text(size=7.5,color ="black", hjust = 1))+
  theme(axis.text.y = element_blank())+
  theme(legend.position="bottom")

#umx_ggcorrplot<-
#ggcorrplot::ggcorrplot(umx_hetcor_mat,
#           ggtheme = ggplot2::theme_void,
#           insig = "blank",
#           #pch=1,
#           pch.cex=3,
#           tl.srt = 45, 
#           pch="ns",
#          p.mat = hetcor_mat$tests, #  replacement has 144 rows, data has 169
#            #type = "lower",
#           colors = c("#6D9EC1", "white", "#E46726"), 
#           tl.cex=8,
#           lab=F)+
#  #scale_x_discrete(labels = var_lbls_p345, drop = F) +
#  #scale_y_discrete(labels = var_lbls_p345, drop = F) +
#  theme(axis.text.x = element_blank())+
#  #theme(axis.text.y = element_text(size=7.5,color ="black", hjust = 1))+
#  theme(axis.text.y = element_blank())+
#  theme(legend.position="bottom")

ggplotly(ggcorrplot, height = 800, width=800)%>% 
  layout(xaxis= list(showticklabels = FALSE)) %>% 
 layout(annotations = 
 list(x = .1, y = -0.031, text = "", 
      showarrow = F, xref='paper', yref='paper', 
      #xanchor='center', yanchor='auto', xshift=0, yshift=-0,
      font=list(size=11, color="darkblue"))
 )

Figure 1. Heterogeneous Correlation Matrix of Variables of Interest


Imputation


We generated a plot to see all the missing values in the sample.


#<div style="border: 1px solid #ddd; padding: 5px; overflow-y: scroll; height:400px; overflow-x: scroll; width:100%">

missing.values<-
CONS_C1_df_dup_SEP_2020_match %>%
  rowwise %>%
  dplyr::mutate_at(.vars = vars(sus_ini_mod_mvv:fech_ing_num),
                   .funs = ~ifelse(is.na(.), 1, 0)) %>% 
  dplyr::ungroup() %>% 
  dplyr::summarise_at(vars(sus_ini_mod_mvv:fech_ing_num),~sum(.))

plot_miss<-
missing.values %>%
  melt() %>% 
  dplyr::mutate(perc= value/sum(sel_treat_tabyl[,2])) %>% 
  dplyr::mutate(label_text= paste0("Variable= ",variable,"<br>n= ",value,"<br>",scales::percent(round(perc,3)))) %>%
  dplyr::mutate(perc=perc*100) %>% 
  ggplot() +
  geom_bar(aes(x=factor(variable), y=perc,label= label_text), stat = 'identity') +
  sjPlot::theme_sjplot()+
#  scale_y_continuous(limits=c(0,1), labels=percent)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=9))+
  labs(x=NULL, y="% of Missing Values", caption=paste0("Nota. Porcentaje de perdidos del total (",sum(sel_treat_tabyl[,2]),")"))
  ggplotly(plot_miss, tooltip = c("label_text"))%>% layout(xaxis= list(showticklabels = T), height = 600, width=800) %>%   layout(yaxis = list(tickformat='%',  range = c(0, 7)))

Figure 2. Bar plot of Porcentaje of Missing Values per Variables

  #</div>


From the figure above, we could see that the starting substance (sus_ini_mvv) and the onset of drug use (edad_ini_cons) had around 6% of missing data. These values should be imputed. We first focused on the age of onset of drug use.


label(CONS_C1_df_dup_SEP_2020_match$sus_ini_mod_mvv)<-"Starting Substance"
label(CONS_C1_df_dup_SEP_2020_match$estado_conyugal_2)<-"Marital Status"
label(CONS_C1_df_dup_SEP_2020_match$escolaridad_rec)<-"Educational Attainment"
label(CONS_C1_df_dup_SEP_2020_match$estatus_ocupacional_rec)<-"Occupational Status"
label(CONS_C1_df_dup_SEP_2020_match$edad_ini_cons)<-"Age of Onset of Drug Use"
label(CONS_C1_df_dup_SEP_2020_match$freq_cons_sus_prin)<-"Frequency of use of primary drug"
label(CONS_C1_df_dup_SEP_2020_match$origen_ingreso_mod)<-"Motive of Admission to Treatment"
label(CONS_C1_df_dup_SEP_2020_match$dg_cie_10_rec)<-"Psychiatric comorbidity"
label(CONS_C1_df_dup_SEP_2020_match$nombre_region)<-"Chilean Region of the Center"
label(CONS_C1_df_dup_SEP_2020_match$tipo_centro_pub)<-"Type of Center (Public)"
label(CONS_C1_df_dup_SEP_2020_match$abandono_temprano_rec)<-"Early Dropout"
label(CONS_C1_df_dup_SEP_2020_match$tipo_de_plan_res)<-"Residential Type of Plan"
label(CONS_C1_df_dup_SEP_2020_match$evaluacindelprocesoteraputico_min)<-"Evaluation of the Therapeutic Process (Min. Achievement)"
label(CONS_C1_df_dup_SEP_2020_match$prev_treat)<-"No. of Previous Treatments"
label(CONS_C1_df_dup_SEP_2020_match$sexo_2)<-"Sex"
label(CONS_C1_df_dup_SEP_2020_match$edad_al_ing)<-"Age at Admission"
label(CONS_C1_df_dup_SEP_2020_match$fech_ing_num)<-"Date of Admission to Treatment (Numeric)"

#origen_ingreso #dg_global_nec_int_soc_or_1 "Diagnóstico global de necesidades de integración social" #evaluacindelprocesoteraputico "Evaluación del proceso terapéutico" #escolaridad_rec "macrozona"

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

#CONS_C1_df_dup_SEP_2020 %>% janitor::tabyl(evaluacindelprocesoteraputico) 
#CONS_C1_df_dup_SEP_2020 %>% janitor::tabyl(evaluacindelprocesoteraputico) 
#CONS_C1_df_dup_SEP_2020 %>% janitor::tabyl(nombre_region)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

library(Amelia)
CONS_C1_df_dup_SEP_2020_match<-
CONS_C1_df_dup_SEP_2020_match %>% 
    dplyr::group_by(hash_key) %>% 
    dplyr::mutate(rn=row_number()) %>% 
    dplyr::ungroup() %>% 
    data.table::data.table()

dataset_match_miss_sus_prin<-  
  CONS_C1_df_dup_SEP_2020 %>% 
    dplyr::filter(row %in% unlist(CONS_C1_df_dup_SEP_2020_match$row)) %>% 
      dplyr::select(row, edad_ini_sus_prin,via_adm_sus_prin_act,compromiso_biopsicosocial)
  
  #HACER BASE ESPECIAL QUE CONTENGA UNA VARIABLE DE EDAD DE INICIO DE CONSUMO DE SUSTANCIA PRINCIPAL PARA EQUIPARAR
  CONS_C1_df_dup_SEP_2020_match_miss<-
    CONS_C1_df_dup_SEP_2020_match %>%
    dplyr::left_join(dataset_match_miss_sus_prin,by="row")
  
amelia_fit <- amelia(CONS_C1_df_dup_SEP_2020_match_miss, 
                     m=30, parallel = "multicore", #noms = "row",
                     idvars=c("row"),#"hash_key","rn"
                     noms= c("sus_ini_mod_mvv", "estado_conyugal_2", "escolaridad_rec", "estatus_ocupacional_rec","via_adm_sus_prin_act", "freq_cons_sus_prin", "origen_ingreso_mod",  "nombre_region", "sexo_2","compromiso_biopsicosocial"),
                     ords= c("dg_cie_10_rec", "prev_treat"),
                     cs = "hash_key", ts = "rn")
-- Imputation 1 --

  1  2  3  4

-- Imputation 2 --

  1  2  3  4

-- Imputation 3 --

  1  2  3  4

-- Imputation 4 --

  1  2  3  4

-- Imputation 5 --

  1  2  3

-- Imputation 6 --

  1  2  3  4

-- Imputation 7 --

  1  2  3  4

-- Imputation 8 --

  1  2  3  4

-- Imputation 9 --

  1  2  3  4

-- Imputation 10 --

  1  2  3  4

-- Imputation 11 --

  1  2  3  4

-- Imputation 12 --

  1  2  3  4

-- Imputation 13 --

  1  2  3  4

-- Imputation 14 --

  1  2  3  4

-- Imputation 15 --

  1  2  3  4

-- Imputation 16 --

  1  2  3  4

-- Imputation 17 --

  1  2  3  4

-- Imputation 18 --

  1  2  3  4

-- Imputation 19 --

  1  2  3  4

-- Imputation 20 --

  1  2  3  4

-- Imputation 21 --

  1  2  3  4

-- Imputation 22 --

  1  2  3  4

-- Imputation 23 --

  1  2  3  4

-- Imputation 24 --

  1  2  3  4

-- Imputation 25 --

  1  2  3  4

-- Imputation 26 --

  1  2  3  4

-- Imputation 27 --

  1  2  3  4

-- Imputation 28 --

  1  2  3  4

-- Imputation 29 --

  1  2  3  4

-- Imputation 30 --

  1  2  3  4
#amelia_fit$imputations$imp1
#CONS_C1_df_dup_SEP_2020_match[!complete.cases(CONS_C1_df_dup_SEP_2020_match[,..match.on_tot])]

compare.density(amelia_fit,var="edad_ini_cons")
Figure 3. Bar plot of Porcentaje of Missing Values per Variables

Figure 3. Bar plot of Porcentaje of Missing Values per Variables


Based on the figure above, the age of onset of drug use was similar between the imputed values and the observed. However, there were two logical conditions to fulfill in order to replace adequately these values in the database: the age of onset must not be greater than the age of onset of drug use in the primary substance at admission, and may not be greater than the age of admission to treatment. We selected the minimum value of age of onset of drug use among the imputed, because one user could not have more than one age of onset of drug use.


#On this graph, a y = x line indicates the line of perfect agreement; that is, if the imputation model was a perfect predictor of the true value, all the imputations would fall on this line
no_mostrar=0
if(no_mostrar==1){
  res <- { 
    setTimeLimit(nn_K*500)
    ovr_imp_edad_ini_cons<-overimpute(amelia_fit, var = "edad_ini_cons")
  }
}
#Hay poca relación en las imputaciones.

# Ver si alguno de los usuarios con valores perdidos tiene de todas formas datos en esta variable.
edad_ini_sus_prin_for_imp<-
CONS_C1_df_dup_SEP_2020 %>% 
    dplyr::filter( hash_key %in% unlist(unique(CONS_C1_df_dup_SEP_2020_match[which(!complete.cases(CONS_C1_df_dup_SEP_2020_match$edad_ini_cons)),"hash_key"]))) %>%
    dplyr::filter(!is.na(edad_ini_sus_prin)) %>% 
    dplyr::group_by(hash_key) %>% 
    summarise(min_edad_ini_sus_prin=min(edad_ini_sus_prin, na.rm=T), edad_al_ing_min=min(edad_al_ing,na.rm=T))

cumplimiento_errores<-data.frame()
for (i in paste0("imp",1:30)){
  rn_cum_err<-data.frame(amelia_fit$imputations[i]) %>% 
    dplyr::rename(hash_key = 2) %>% 
    dplyr::rename(edad_ini_cons = 7) %>% 
    dplyr::mutate(hash_key=as.character(hash_key)) %>% 
    dplyr::left_join(edad_ini_sus_prin_for_imp, by="hash_key") %>% 
    dplyr::filter(edad_al_ing_min<edad_ini_cons|edad_ini_cons>min_edad_ini_sus_prin) %>% # edad de inicio de consumo no debe ser mayor a la menor edad a la ingreso de cada usuario, y la mínima edad de inicio de consumo de sustancia principal es menor a la edad de inicio de consumo
    nrow()
  rn_cum_err<-cbind(i,rn_cum_err)
 cumplimiento_errores<- rbind(cumplimiento_errores,rn_cum_err)
}
colnames(cumplimiento_errores)<- c("imp","no_errors_age_of_onset_drug_use")

paste0("Number of users that had different age of onset of drug use before replacement: ",CONS_C1_df_dup_SEP_2020_match %>% 
    dplyr::group_by(hash_key) %>% 
    dplyr::mutate(n_dis=n_distinct(edad_ini_cons)) %>% 
    dplyr::ungroup() %>% 
    dplyr::filter(n_dis>1) %>% 
      nrow())
[1] "Number of users that had different age of onset of drug use before replacement: 0"
n_miss_edad_ini_cons<-nrow(CONS_C1_df_dup_SEP_2020_match[which(!complete.cases(CONS_C1_df_dup_SEP_2020_match$edad_ini_cons)),"hash_key"])
plot_imps<-
cumplimiento_errores %>%
  dplyr::mutate(no_errors_age_of_onset_drug_use=as.numeric(no_errors_age_of_onset_drug_use)) %>% 
  dplyr::mutate(imp=factor(imp, levels =paste0("imp",1:30))) %>% 
  dplyr::mutate(perc= no_errors_age_of_onset_drug_use/n_miss_edad_ini_cons) %>% 
  dplyr::mutate(label_text= paste0("Variable= ",imp,"<br>n= ",no_errors_age_of_onset_drug_use,"<br>",scales::percent(round(perc,2)))) %>%
  dplyr::mutate(perc=perc*100) %>% 
  ggplot() +
  geom_bar(aes(x=imp, y=perc,label= label_text), stat = 'identity') +
  sjPlot::theme_sjplot()+
#  scale_y_continuous(limits=c(0,1), labels=percent)+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=9))+
  labs(x=NULL, y="% of Values With Logical Discrepancies")+
  theme(aspect.ratio = 2/1)
  ggplotly(plot_imps, tooltip = c("label_text"))%>% layout(xaxis= list(showticklabels = T)) %>%   layout(yaxis = list(tickformat='%',  range = c(0, 40)), height = 600, width=800) 

Figure 4. Bar plot of Percentage of Incorrect Imputed Values per Imputation Sample

edad_ini_cons_imputed<-
  cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$edad_ini_cons,
       amelia_fit$imputations$imp2$edad_ini_cons,
       amelia_fit$imputations$imp3$edad_ini_cons,
       amelia_fit$imputations$imp4$edad_ini_cons,
       amelia_fit$imputations$imp5$edad_ini_cons,
       amelia_fit$imputations$imp6$edad_ini_cons,
       amelia_fit$imputations$imp7$edad_ini_cons,
       amelia_fit$imputations$imp8$edad_ini_cons,
       amelia_fit$imputations$imp9$edad_ini_cons,
       amelia_fit$imputations$imp10$edad_ini_cons,
       amelia_fit$imputations$imp11$edad_ini_cons,
       amelia_fit$imputations$imp12$edad_ini_cons,
       amelia_fit$imputations$imp13$edad_ini_cons,
       amelia_fit$imputations$imp14$edad_ini_cons,
       amelia_fit$imputations$imp15$edad_ini_cons,
       amelia_fit$imputations$imp16$edad_ini_cons,
       amelia_fit$imputations$imp17$edad_ini_cons,
       amelia_fit$imputations$imp18$edad_ini_cons,
       amelia_fit$imputations$imp19$edad_ini_cons,
       amelia_fit$imputations$imp20$edad_ini_cons,
       amelia_fit$imputations$imp21$edad_ini_cons,
       amelia_fit$imputations$imp22$edad_ini_cons,
       amelia_fit$imputations$imp23$edad_ini_cons,
       amelia_fit$imputations$imp24$edad_ini_cons,
       amelia_fit$imputations$imp25$edad_ini_cons,
       amelia_fit$imputations$imp26$edad_ini_cons,
       amelia_fit$imputations$imp27$edad_ini_cons,
       amelia_fit$imputations$imp28$edad_ini_cons,
       amelia_fit$imputations$imp29$edad_ini_cons,
       amelia_fit$imputations$imp30$edad_ini_cons
       ) 

edad_ini_cons_imputed<-cbind(data.table(edad_ini_cons_imputed[,1],data.table(edad_ini_cons_imputed[,2:31])[, AVG := rowMeans(.SD, na.rm=T)]))%>% 
  janitor::clean_names() %>%
 mutate(Min = dplyr::select(., amelia_fit_imputations_imp1_edad_ini_cons:amelia_fit_imputations_imp30_edad_ini_cons) %>% purrr::reduce(pmin))
  

# Reemplazo los valores perdidos:
CONS_C1_df_dup_SEP_2020_match_miss1<-
CONS_C1_df_dup_SEP_2020_match_miss %>% 
  dplyr::left_join(edad_ini_cons_imputed,by=c("row"="v1")) %>% 
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(min_edad_al_ing=min(edad_al_ing,na.rm=T),min_edad_al_ing=ifelse(is.infinite(min_edad_al_ing), NA, min_edad_al_ing)) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(edad_ini_cons=dplyr::case_when(is.na(edad_ini_cons) & as.numeric(avg)<edad_ini_sus_prin & as.numeric(avg)<min_edad_al_ing~as.numeric(avg),is.na(edad_ini_cons) & as.numeric(Min)<edad_ini_sus_prin & as.numeric(Min)<min_edad_al_ing~as.numeric(Min),
  is.na(edad_ini_cons) & is.na(edad_ini_sus_prin) & avg<as.numeric(min_edad_al_ing)~as.numeric(avg),is.na(edad_ini_cons) & is.na(edad_ini_sus_prin) & Min<as.numeric(min_edad_al_ing)~as.numeric(Min),
                                               TRUE~as.numeric(edad_ini_cons)))

#is.na(edad_ini_cons) & is.na(edad_ini_sus_prin) & is.na(min_edad_al_ing)~as.numeric(avg),
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss1$edad_ini_cons))
paste0("Number of rows with values that did not fulfilled the conditions: ",CONS_C1_df_dup_SEP_2020_match_miss1 %>% 
    dplyr::filter(is.na(edad_ini_cons)) %>% 
    dplyr::select(hash_key, edad_ini_cons, min_edad_al_ing,edad_ini_sus_prin,min_edad_al_ing, avg, Min) %>% nrow())
[1] "Number of rows with values that did not fulfilled the conditions: 9"
CONS_C1_df_dup_SEP_2020_match_miss1<-
CONS_C1_df_dup_SEP_2020_match_miss1 %>% 
    dplyr::group_by(hash_key) %>% 
    dplyr::mutate(n_dis=n_distinct(edad_ini_cons)) %>% 
    dplyr::ungroup() %>% 
    dplyr::group_by(hash_key) %>% 
  dplyr::mutate(min_edad_ini_cons=min(edad_ini_cons,na.rm=T), edad_ini_cons= dplyr::case_when(n_dis>1 & !is.na(min_edad_ini_cons)~min_edad_ini_cons,TRUE~edad_ini_cons))

paste0("Number of users that had different age of onset of drug use after replacement: ",CONS_C1_df_dup_SEP_2020_match_miss1 %>% 
    dplyr::group_by(hash_key) %>% 
    dplyr::mutate(n_dis=n_distinct(edad_ini_cons)) %>% 
    dplyr::ungroup() %>% 
    dplyr::filter(n_dis>1) %>% 
      nrow())
[1] "Number of users that had different age of onset of drug use after replacement: 0"


There were 9 cases of imputed ages of onset of drug use that did not fulfilled the conditions necessary to replace the missing values with the imputed ones. We also looked over the


#On this graph, a y = x line indicates the line of perfect agreement; that is, if the imputation model was a perfect predictor of the true value, all the imputations would fall on this line
no_mostrar=0
if(no_mostrar==1){
  res <- { 
    setTimeLimit(nn_K*500)
    ovr_imp_edad_ini_cons<-overimpute(amelia_fit, var = "edad_al_ing")
  }
}
#Hay poca relación en las imputaciones.
#table(is.na(CONS_C1_df_dup_SEP_2020_match_not_miss$edad_al_ing),exclude=NULL)

edad_al_ing_imputed<-
  cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$edad_al_ing,
       amelia_fit$imputations$imp2$edad_al_ing,
       amelia_fit$imputations$imp3$edad_al_ing,
       amelia_fit$imputations$imp4$edad_al_ing,
       amelia_fit$imputations$imp5$edad_al_ing,
       amelia_fit$imputations$imp6$edad_al_ing,
       amelia_fit$imputations$imp7$edad_al_ing,
       amelia_fit$imputations$imp8$edad_al_ing,
       amelia_fit$imputations$imp9$edad_al_ing,
       amelia_fit$imputations$imp10$edad_al_ing,
       amelia_fit$imputations$imp11$edad_al_ing,
       amelia_fit$imputations$imp12$edad_al_ing,
       amelia_fit$imputations$imp13$edad_al_ing,
       amelia_fit$imputations$imp14$edad_al_ing,
       amelia_fit$imputations$imp15$edad_al_ing,
       amelia_fit$imputations$imp16$edad_al_ing,
       amelia_fit$imputations$imp17$edad_al_ing,
       amelia_fit$imputations$imp18$edad_al_ing,
       amelia_fit$imputations$imp19$edad_al_ing,
       amelia_fit$imputations$imp20$edad_al_ing,
       amelia_fit$imputations$imp21$edad_al_ing,
       amelia_fit$imputations$imp22$edad_al_ing,
       amelia_fit$imputations$imp23$edad_al_ing,
       amelia_fit$imputations$imp24$edad_al_ing,
       amelia_fit$imputations$imp25$edad_al_ing,
       amelia_fit$imputations$imp26$edad_al_ing,
       amelia_fit$imputations$imp27$edad_al_ing,
       amelia_fit$imputations$imp28$edad_al_ing,
       amelia_fit$imputations$imp29$edad_al_ing,
       amelia_fit$imputations$imp30$edad_al_ing
       ) 

edad_al_ing_imputed<-cbind(data.table(edad_al_ing_imputed[,1],data.table(edad_al_ing_imputed[,2:31])[, AVG_edad_ing := rowMeans(.SD, na.rm=T)]))%>% 
  janitor::clean_names() %>%
 mutate(Min_edad_ing = dplyr::select(., amelia_fit_imputations_imp1_edad_al_ing:amelia_fit_imputations_imp30_edad_al_ing) %>% purrr::reduce(pmin))
  

# Reemplazo los valores perdidos:
CONS_C1_df_dup_SEP_2020_match_miss12<-
CONS_C1_df_dup_SEP_2020_match_miss1 %>% 
  dplyr::left_join(edad_al_ing_imputed,by=c("row"="v1")) %>% 
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(min_edad_ini_cons=min(edad_ini_cons,na.rm=T),min_edad_ini_cons=ifelse(is.infinite(min_edad_ini_cons), NA, min_edad_ini_cons)) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(edad_al_ing=dplyr::case_when(is.na(edad_al_ing) & as.numeric(avg_edad_ing)>=min_edad_ini_cons & as.numeric(avg_edad_ing)>=edad_ini_sus_prin~as.numeric(avg_edad_ing),is.na(edad_al_ing) & as.numeric(Min_edad_ing)>=edad_ini_sus_prin & as.numeric(Min_edad_ing)>=edad_ini_sus_prin~as.numeric(Min_edad_ing),is.na(edad_al_ing) & is.na(edad_ini_sus_prin) & avg_edad_ing>=as.numeric(min_edad_ini_cons)~as.numeric(avg_edad_ing),is.na(edad_al_ing) & is.na(edad_ini_sus_prin) & Min_edad_ing>=as.numeric(min_edad_ini_cons)~as.numeric(Min_edad_ing),TRUE~as.numeric(edad_al_ing)))

no_mostrar=0
if(no_mostrar==1){
  try(
  CONS_C1_df_dup_SEP_2020_match_miss1 %>% 
      dplyr::left_join(edad_al_ing_imputed,by=c("row"="v1")) %>% 
      dplyr::group_by(hash_key) %>% 
      dplyr::mutate(min_edad_ini_cons=min(edad_ini_cons,na.rm=T),min_edad_ini_cons=ifelse(is.infinite(min_edad_ini_cons), NA, min_edad_ini_cons)) %>% 
      dplyr::ungroup() %>% 
      dplyr::filter(is.na(edad_al_ing)) %>% 
      dplyr::select(hash_key, rn, edad_al_ing, avg_edad_ing, Min_edad_ing,min_edad_ini_cons,edad_ini_sus_prin)
  )
}
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss12$edad_al_ing))


We selected the most vulnerable value (First, Cocaine paste, Cocaine hydrochloride or snort cocaine, Marijuana, Alcohol, and Other).


# Ver distintos valores propuestos para sustancia de inciio
sus_ini_mod_mvv_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$sus_ini_mod_mvv,
       amelia_fit$imputations$imp2$sus_ini_mod_mvv,
       amelia_fit$imputations$imp3$sus_ini_mod_mvv,
       amelia_fit$imputations$imp4$sus_ini_mod_mvv,
       amelia_fit$imputations$imp5$sus_ini_mod_mvv,
       amelia_fit$imputations$imp6$sus_ini_mod_mvv,
       amelia_fit$imputations$imp7$sus_ini_mod_mvv,
       amelia_fit$imputations$imp8$sus_ini_mod_mvv,
       amelia_fit$imputations$imp9$sus_ini_mod_mvv,
       amelia_fit$imputations$imp10$sus_ini_mod_mvv,
       amelia_fit$imputations$imp11$sus_ini_mod_mvv,
       amelia_fit$imputations$imp12$sus_ini_mod_mvv,
       amelia_fit$imputations$imp13$sus_ini_mod_mvv,
       amelia_fit$imputations$imp14$sus_ini_mod_mvv,
       amelia_fit$imputations$imp15$sus_ini_mod_mvv,
       amelia_fit$imputations$imp16$sus_ini_mod_mvv,
       amelia_fit$imputations$imp17$sus_ini_mod_mvv,
       amelia_fit$imputations$imp18$sus_ini_mod_mvv,
       amelia_fit$imputations$imp19$sus_ini_mod_mvv,
       amelia_fit$imputations$imp20$sus_ini_mod_mvv,
       amelia_fit$imputations$imp21$sus_ini_mod_mvv,
       amelia_fit$imputations$imp22$sus_ini_mod_mvv,
       amelia_fit$imputations$imp23$sus_ini_mod_mvv,
       amelia_fit$imputations$imp24$sus_ini_mod_mvv,
       amelia_fit$imputations$imp25$sus_ini_mod_mvv,
       amelia_fit$imputations$imp26$sus_ini_mod_mvv,
       amelia_fit$imputations$imp27$sus_ini_mod_mvv,
       amelia_fit$imputations$imp28$sus_ini_mod_mvv,
       amelia_fit$imputations$imp29$sus_ini_mod_mvv,
       amelia_fit$imputations$imp30$sus_ini_mod_mvv
       ) 

sus_ini_mod_mvv_imputed<-
sus_ini_mod_mvv_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Marijuana",as.character(.))~1,TRUE~0), .names="mar_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Alcohol",as.character(.))~1,TRUE~0), .names="oh_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Cocaine paste",as.character(.))~1,TRUE~0), .names="pb_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Cocaine hydrochloride",as.character(.))~1,TRUE~0), .names="coc_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.sus_ini_mod_mvv:amelia_fit.imputations.imp30.sus_ini_mod_mvv),~dplyr::case_when(grepl("Other",as.character(.))~1,TRUE~0), .names="otr_{col}"))%>%
        dplyr::mutate(sus_ini_mod_mvv_mar = base::rowSums(dplyr::select(., starts_with("mar_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_oh = base::rowSums(dplyr::select(., starts_with("oh_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_pb = base::rowSums(dplyr::select(., starts_with("pb_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_coc = base::rowSums(dplyr::select(., starts_with("coc_"))))%>%
  dplyr::mutate(sus_ini_mod_mvv_otr = base::rowSums(dplyr::select(., starts_with("otr_")))) %>% 
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_mar>0~1,TRUE~0)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_oh>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_pb>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_coc>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_tot=dplyr::case_when(sus_ini_mod_mvv_otr>0~sus_ini_mod_mvv_tot+1,TRUE~sus_ini_mod_mvv_tot)) %>% 
  dplyr::mutate(sus_ini_mod_mvv_to_imputation=dplyr::case_when(sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_pb>0~"Cocaine paste",
                                                               sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_coc>0~"Cocaine hydrochloride",
                                                               sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_mar>0~"Marijuana",
                                                               sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_oh>0~"Alcohol",
                                                               sus_ini_mod_mvv_tot==1 & sus_ini_mod_mvv_otr>0~"Other",
                                                                
                                                               sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_pb>0~"Cocaine paste",
                                                               sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_coc>0~"Cocaine hydrochloride",
                                                                sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_mar>0~"Marijuana",
                                                                sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_oh>0~"Alcohol",
                                                                sus_ini_mod_mvv_tot>1 & sus_ini_mod_mvv_otr>0~"Other")) %>% 
  janitor::clean_names()

sus_ini_mod_mvv_imputed<-
dplyr::select(sus_ini_mod_mvv_imputed,amelia_fit_imputations_imp1_row,sus_ini_mod_mvv_to_imputation)

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss2<-
CONS_C1_df_dup_SEP_2020_match_miss1 %>% 
   dplyr::left_join(sus_ini_mod_mvv_imputed, by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(sus_ini_mod_mvv=factor(dplyr::case_when(is.na(sus_ini_mod_mvv)~as.character(sus_ini_mod_mvv_to_imputation),
                                 TRUE~as.character(sus_ini_mod_mvv)))) %>% 
  data.table()
 
#_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_##_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_#
#_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_##_#_#_#_#_#_#__#_##_#_#_#_#_#_#_#_#_#_#_#_#__#_##_#_#_#_#_#


Another variable that is worth imputing is the Frequency of use of primary drug at admission. We selected the imputed values with the value with the most frequent drug use.


# Ver distintos valores propuestos para sustancia de inciio
freq_cons_sus_prin_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$freq_cons_sus_prin,
       amelia_fit$imputations$imp2$freq_cons_sus_prin,
       amelia_fit$imputations$imp3$freq_cons_sus_prin,
       amelia_fit$imputations$imp4$freq_cons_sus_prin,
       amelia_fit$imputations$imp5$freq_cons_sus_prin,
       amelia_fit$imputations$imp6$freq_cons_sus_prin,
       amelia_fit$imputations$imp7$freq_cons_sus_prin,
       amelia_fit$imputations$imp8$freq_cons_sus_prin,
       amelia_fit$imputations$imp9$freq_cons_sus_prin,
       amelia_fit$imputations$imp10$freq_cons_sus_prin,
       amelia_fit$imputations$imp11$freq_cons_sus_prin,
       amelia_fit$imputations$imp12$freq_cons_sus_prin,
       amelia_fit$imputations$imp13$freq_cons_sus_prin,
       amelia_fit$imputations$imp14$freq_cons_sus_prin,
       amelia_fit$imputations$imp15$freq_cons_sus_prin,
       amelia_fit$imputations$imp16$freq_cons_sus_prin,
       amelia_fit$imputations$imp17$freq_cons_sus_prin,
       amelia_fit$imputations$imp18$freq_cons_sus_prin,
       amelia_fit$imputations$imp19$freq_cons_sus_prin,
       amelia_fit$imputations$imp20$freq_cons_sus_prin,
       amelia_fit$imputations$imp21$freq_cons_sus_prin,
       amelia_fit$imputations$imp22$freq_cons_sus_prin,
       amelia_fit$imputations$imp23$freq_cons_sus_prin,
       amelia_fit$imputations$imp24$freq_cons_sus_prin,
       amelia_fit$imputations$imp25$freq_cons_sus_prin,
       amelia_fit$imputations$imp26$freq_cons_sus_prin,
       amelia_fit$imputations$imp27$freq_cons_sus_prin,
       amelia_fit$imputations$imp28$freq_cons_sus_prin,
       amelia_fit$imputations$imp29$freq_cons_sus_prin,
       amelia_fit$imputations$imp30$freq_cons_sus_prin
       ) 

freq_cons_sus_prin_imputed<-
freq_cons_sus_prin_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("1 day a week or more",as.character(.))~1,TRUE~0), .names="1_day_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("2 to 3 days a week",as.character(.))~1,TRUE~0), .names="2_3_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("4 to 6 days a week",as.character(.))~1,TRUE~0), .names="4_6_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("Less than 1 day a week",as.character(.))~1,TRUE~0), .names="less_1_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("Did not use",as.character(.))~1,TRUE~0), .names="did_not_{col}"))%>%
    dplyr::mutate(across(c(amelia_fit.imputations.imp1.freq_cons_sus_prin:amelia_fit.imputations.imp30.freq_cons_sus_prin),~dplyr::case_when(grepl("Daily",as.character(.))~1,TRUE~0), .names="daily_{col}"))%>%
  dplyr::mutate(freq_cons_sus_prin_daily = base::rowSums(dplyr::select(., starts_with("daily_")))) %>% 
  dplyr::mutate(freq_cons_sus_prin_4_6 = base::rowSums(dplyr::select(., starts_with("4_6_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_2_3 = base::rowSums(dplyr::select(., starts_with("2_3_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_1_day = base::rowSums(dplyr::select(., starts_with("1_day_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_less_1 = base::rowSums(dplyr::select(., starts_with("less_1_"))))%>%
  dplyr::mutate(freq_cons_sus_prin_did_not = base::rowSums(dplyr::select(., starts_with("did_not_")))) %>% 
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_1_day>0~1,TRUE~0)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_2_3>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_4_6>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_less_1>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_did_not>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  dplyr::mutate(freq_cons_sus_prin_tot=dplyr::case_when(freq_cons_sus_prin_daily>0~freq_cons_sus_prin_tot+1,TRUE~freq_cons_sus_prin_tot)) %>% 
  #hierarchy
  dplyr::mutate(freq_cons_sus_prin_to_imputation=
                  dplyr::case_when(freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_daily>0~"Daily",
                                     freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_4_6>0~"4 to 6 days a week",
                                     freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_2_3>0~"2 to 3 days a week",
                                     freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_1_day>0~"1 day a week or more",
                                     freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_less_1>0~"Less than 1 day a week",
                                     freq_cons_sus_prin_tot==1 & freq_cons_sus_prin_did_not>0~"Did not use",
                                                                
                                     freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_daily>0~"Daily",
                                     freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_4_6>0~"4 to 6 days a week",
                                     freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_2_3>0~"2 to 3 days a week",
                                     freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_1_day>0~"1 day a week or more",
                                     freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_less_1>0~"Less than 1 day a week",
                                     freq_cons_sus_prin_tot>1 & freq_cons_sus_prin_did_not>0~"Did not use"
                                   )) %>% 
  janitor::clean_names()

freq_cons_sus_prin_imputed<-
dplyr::select(freq_cons_sus_prin_imputed,amelia_fit_imputations_imp1_row,freq_cons_sus_prin_to_imputation)


#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss3<-
CONS_C1_df_dup_SEP_2020_match_miss2 %>% 
   dplyr::left_join(freq_cons_sus_prin_imputed, by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(freq_cons_sus_prin=factor(dplyr::case_when(is.na(freq_cons_sus_prin)~as.character(freq_cons_sus_prin_to_imputation), TRUE~as.character(freq_cons_sus_prin)))) %>% 
  data.table()


Another variable that is worth imputing is the Educational Attainment. We were particularly cautious to impute attainments that would follow a progression from primary school to more than high school.


# Ver distintos valores propuestos para sustancia de inciio
escolaridad_rec_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
                  amelia_fit$imputations$imp1$hash_key,
                  amelia_fit$imputations$imp1$fech_ing_num,
                  amelia_fit$imputations$imp1$escolaridad_rec,
                  amelia_fit$imputations$imp2$escolaridad_rec,
                  amelia_fit$imputations$imp3$escolaridad_rec,
                  amelia_fit$imputations$imp4$escolaridad_rec,
                  amelia_fit$imputations$imp5$escolaridad_rec,
                  amelia_fit$imputations$imp6$escolaridad_rec,
                  amelia_fit$imputations$imp7$escolaridad_rec,
                  amelia_fit$imputations$imp8$escolaridad_rec,
                  amelia_fit$imputations$imp9$escolaridad_rec,
                  amelia_fit$imputations$imp10$escolaridad_rec,
                  amelia_fit$imputations$imp11$escolaridad_rec,
                  amelia_fit$imputations$imp12$escolaridad_rec,
                  amelia_fit$imputations$imp13$escolaridad_rec,
                  amelia_fit$imputations$imp14$escolaridad_rec,
                  amelia_fit$imputations$imp15$escolaridad_rec,
                  amelia_fit$imputations$imp16$escolaridad_rec,
                  amelia_fit$imputations$imp17$escolaridad_rec,
                  amelia_fit$imputations$imp18$escolaridad_rec,
                  amelia_fit$imputations$imp19$escolaridad_rec,
                  amelia_fit$imputations$imp20$escolaridad_rec,
                  amelia_fit$imputations$imp21$escolaridad_rec,
                  amelia_fit$imputations$imp22$escolaridad_rec,
                  amelia_fit$imputations$imp23$escolaridad_rec,
                  amelia_fit$imputations$imp24$escolaridad_rec,
                  amelia_fit$imputations$imp25$escolaridad_rec,
                  amelia_fit$imputations$imp26$escolaridad_rec,
                  amelia_fit$imputations$imp27$escolaridad_rec,
                  amelia_fit$imputations$imp28$escolaridad_rec,
                  amelia_fit$imputations$imp29$escolaridad_rec,
                  amelia_fit$imputations$imp30$escolaridad_rec,
                  amelia_fit$imputations$imp30$rn
               ) 

escolaridad_rec_imputed<-
escolaridad_rec_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.escolaridad_rec:amelia_fit.imputations.imp30.escolaridad_rec),~dplyr::case_when(grepl("3-Completed primary school or less",as.character(.))~1,TRUE~0), .names="3_primary_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.escolaridad_rec:amelia_fit.imputations.imp30.escolaridad_rec),~dplyr::case_when(grepl("2-Completed high school or less",as.character(.))~1,TRUE~0), .names="2_high_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.escolaridad_rec:amelia_fit.imputations.imp30.escolaridad_rec),~dplyr::case_when(grepl("1-More than high school",as.character(.))~1,TRUE~0), .names="1_more_high_{col}"))%>%

  dplyr::mutate(escolaridad_rec_3_primary = base::rowSums(dplyr::select(., contains("3_primary_")))) %>% 
  dplyr::mutate(escolaridad_rec_2_high = base::rowSums(dplyr::select(., contains("2_high_"))))%>%
  dplyr::mutate(escolaridad_rec_1_more_high = base::rowSums(dplyr::select(., contains("1_more_high_"))))%>%
  
  dplyr::left_join(cbind.data.frame(CONS_C1_df_dup_SEP_2020_match$row, CONS_C1_df_dup_SEP_2020_match$escolaridad_rec),by=c("amelia_fit.imputations.imp1.row"="CONS_C1_df_dup_SEP_2020_match$row")) %>%
  dplyr::rename("escolaridad_rec_original"="CONS_C1_df_dup_SEP_2020_match$escolaridad_rec") %>%
  dplyr::mutate(escolaridad_rec_original=as.numeric(substr(escolaridad_rec_original, 1, 1))) %>%
  dplyr::arrange(amelia_fit.imputations.imp1.hash_key,amelia_fit.imputations.imp30.rn) %>% 
  dplyr::group_by(amelia_fit.imputations.imp1.hash_key) %>%  
  dplyr::mutate(siguiente_escolaridad_rec_original=lead(escolaridad_rec_original), 
                subsig_escolaridad_rec_original=lead(escolaridad_rec_original,n =2), 
                rn=max(amelia_fit.imputations.imp30.rn),
                n_na_esc_or=is.na(escolaridad_rec_original),
                sum_n_na_esc_or=sum(n_na_esc_or,na.rm=T),
                max_sum_n_na_esc_or=max(n_na_esc_or,na.rm=T)
                ) %>% 
#dplyr::select(amelia_fit.imputations.imp1.hash_key,amelia_fit.imputations.imp30.rn,
#              siguiente_escolaridad_rec_original,escolaridad_rec_original,amelia_fit.imputations.imp1.fech_ing_num)%>% View()
  dplyr::ungroup() %>% 
#dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(escolaridad_rec_tot=dplyr::case_when(escolaridad_rec_3_primary>0~1,TRUE~0)) %>% 
  dplyr::mutate(escolaridad_rec_tot=dplyr::case_when(escolaridad_rec_2_high>0~escolaridad_rec_tot+1,TRUE~escolaridad_rec_tot)) %>% 
  dplyr::mutate(escolaridad_rec_tot=dplyr::case_when(escolaridad_rec_1_more_high>0~escolaridad_rec_tot+1,TRUE~escolaridad_rec_tot)) %>% 
  #hierarchy
  dplyr::mutate(escolaridad_rec_to_imputation=
# 1A) TIENE MÁS DE UN TRAT POR USUARIO (RN) Y HAY MAS DE UN VALOR DE IMPUTACION PARA ESCOLARIDAD
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
  dplyr::case_when(
    escolaridad_rec_tot>1 &
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_3_primary>0 &
    rn>1 & !is.na(siguiente_escolaridad_rec_original) &
    siguiente_escolaridad_rec_original<=3 ~"3-Completed primary school or less",
    
    escolaridad_rec_tot>1 & 
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_2_high>0 & 
    rn>1 & !is.na(siguiente_escolaridad_rec_original) &
    siguiente_escolaridad_rec_original<=2 ~"2-Completed high school or less",
    
    escolaridad_rec_tot>1 & #hay más de un valor de imputación para escolaridad
    max_sum_n_na_esc_or==1 & #no hay más de un valor de escolaridad a imputar
    escolaridad_rec_1_more_high >0 & #el valor que tiene en este caso es más de high school
    rn>1 & !is.na(siguiente_escolaridad_rec_original) & #tiene más de un trat por usuario y no hay siguiente escolaridad perdida
    siguiente_escolaridad_rec_original==1~"1-More than high school", #si ya tenía más de universitario, debía tener el mismo valor en el siguiente,
    # 1A2) SI HAY UN PERDIDO EN EL SIGUIENTE TRATAMIENTO, PERO EL SUBSIGUIENTE SÍ TIENE UN VALOR
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    escolaridad_rec_tot>1 &
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_3_primary>0 &
    rn>1 & is.na(siguiente_escolaridad_rec_original) &
    subsig_escolaridad_rec_original<=3 ~"3-Completed primary school or less",
    
    escolaridad_rec_tot>1 & 
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_2_high>0 & 
    rn>1 & is.na(siguiente_escolaridad_rec_original) &
    subsig_escolaridad_rec_original<=2 ~"2-Completed high school or less",
    
    escolaridad_rec_tot>1 & #hay más de un valor de imputación para escolaridad
    max_sum_n_na_esc_or==1 & #no hay más de un valor de escolaridad a imputar
    escolaridad_rec_1_more_high >0 & #el valor que tiene en este caso es más de high school
    rn>1 & is.na(siguiente_escolaridad_rec_original) &
    subsig_escolaridad_rec_original==1~"1-More than high school", #si ya tenía más de universitario, debía tener el mismo valor en el siguiente,
    # 1A3) SI HAY UN PERDIDO EN EL SIGUIENTE TRATAMIENTO, PERO EL SUBSIGUIENTE NO TIENE UN VALOR (QUIZAS PORQUE NO EXISTE INCLUSO)
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    escolaridad_rec_tot>1 &
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_3_primary>0 &
    rn>1 & is.na(siguiente_escolaridad_rec_original) &
    is.na(subsig_escolaridad_rec_original) ~"3-Completed primary school or less",
    
    escolaridad_rec_tot>1 & 
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_2_high>0 & 
    rn>1 & is.na(siguiente_escolaridad_rec_original) &
    is.na(subsig_escolaridad_rec_original) ~"2-Completed high school or less",
    
    escolaridad_rec_tot>1 & #hay más de un valor de imputación para escolaridad
    max_sum_n_na_esc_or==1 & #no hay más de un valor de escolaridad a imputar
    escolaridad_rec_1_more_high >0 & #el valor que tiene en este caso es más de high school
    rn>1 & is.na(siguiente_escolaridad_rec_original) &
    is.na(subsig_escolaridad_rec_original)~"1-More than high school", #si ya tenía más de universitario, debía tener el mismo valor en el siguiente,
    
    # 1B) TIENE MÁS DE UN TRAT POR USUARIO (RN) Y NO HAY MAS DE UN VALOR DE IMPUTACION PARA ESCOLARIDAD
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    
    escolaridad_rec_tot==1 &
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_3_primary>0 &
    rn>1 & !is.na(siguiente_escolaridad_rec_original) &
    siguiente_escolaridad_rec_original<=3 ~"3-Completed primary school or less",
    
    escolaridad_rec_tot==1 & 
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_2_high>0 & 
    rn>1 & !is.na(siguiente_escolaridad_rec_original) &
    siguiente_escolaridad_rec_original<=2 ~"2-Completed high school or less",
    
    escolaridad_rec_tot==1 & #no hay más de un valor de imputación para escolaridad
    max_sum_n_na_esc_or==1 & #no hay más de un valor de escolaridad a imputar
    escolaridad_rec_1_more_high>0 & #el valor que tiene en este caso es más de high school
    rn>1 & !is.na(siguiente_escolaridad_rec_original) & #tiene más de un trat por usuario y no hay siguiente escolaridad perdida
    siguiente_escolaridad_rec_original==1~"1-More than high school", #si ya tenía más de universitario, debía tener el mismo valor en el siguiente,
    
    # 2A) TIENE SOLO UN TRAT POR USUARIO (RN) Y HAY MAS DE UN VALOR DE IMPUTACION PARA ESCOLARIDAD
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    escolaridad_rec_tot>1 &
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_3_primary>0 &
    rn==1 ~"3-Completed primary school or less",
    
    escolaridad_rec_tot>1 & 
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_2_high>0 & 
    rn==1 ~"2-Completed high school or less",
    
    escolaridad_rec_tot>1 & #hay más de un valor de imputación para escolaridad
    max_sum_n_na_esc_or==1 & #no hay más de un valor de escolaridad a imputar
    escolaridad_rec_1_more_high>0 & #el valor que tiene en este caso es más de high school
    rn==1 ~"1-More than high school", #si ya tenía más de universitario, debía tener el mismo valor en el siguiente,
    
    # 2B) TIENE SOLO UN TRAT POR USUARIO (RN) Y NO HAY MAS DE UN VALOR DE IMPUTACION PARA ESCOLARIDAD
    #_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
    
    escolaridad_rec_tot==1 &
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_3_primary>0 &
    rn==1 ~"3-Completed primary school or less",
    
    escolaridad_rec_tot==1 & 
    max_sum_n_na_esc_or==1 &
    escolaridad_rec_2_high>0 & 
    rn==1 ~"2-Completed high school or less",
    
    escolaridad_rec_tot==1 & #no hay más de un valor de imputación para escolaridad
    max_sum_n_na_esc_or==1 & #no hay más de un valor de escolaridad a imputar
    escolaridad_rec_1_more_high>0 & #el valor que tiene en este caso es más de high school
    rn==1 ~"1-More than high school" 
  )) %>% 
janitor::clean_names()

escolaridad_rec_imputed<-
dplyr::select(escolaridad_rec_imputed,amelia_fit_imputations_imp1_row,escolaridad_rec_to_imputation)

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
CONS_C1_df_dup_SEP_2020_match_miss4<-
CONS_C1_df_dup_SEP_2020_match_miss3 %>% 
   dplyr::left_join(escolaridad_rec_imputed, by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(no_info_on_esc=dplyr::case_when(is.na(escolaridad_rec)~1,TRUE~0)) %>% 
    dplyr::mutate(escolaridad_rec=factor(dplyr::case_when(is.na(escolaridad_rec)~as.character(escolaridad_rec_to_imputation), TRUE~as.character(escolaridad_rec)))) %>% 
    dplyr::arrange(hash_key,rn) %>% 
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(escolaridad_rec_num=as.numeric(substr(escolaridad_rec, 1, 1)),
                sig_escolaridad_rec_num=lead(escolaridad_rec_num),
                ant_escolaridad_rec_num=lag(escolaridad_rec_num)) %>% 
  dplyr::mutate(escolaridad_rec=factor(dplyr::case_when(#si el anterior es menos vulnerable que el presente, NA
  escolaridad_rec_num>ant_escolaridad_rec_num~NA_character_, TRUE~as.character(escolaridad_rec)))) %>% 
  dplyr::mutate(escolaridad_rec=factor(dplyr::case_when(# si es más vulnerable que el presente, es NA 
  escolaridad_rec_num<sig_escolaridad_rec_num~NA_character_, TRUE~as.character(escolaridad_rec)))) %>% 
  dplyr::ungroup() %>% 
  data.table()

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:
paste0("Check inconsistencies with posterior educational attainments (0= No inconsistencies): ",CONS_C1_df_dup_SEP_2020_match_miss4 %>% 
  dplyr::arrange(hash_key,rn) %>% 
  dplyr::group_by(hash_key) %>% 
  dplyr::mutate(escolaridad_rec_num=as.numeric(substr(escolaridad_rec, 1, 1)),
                sig_escolaridad_rec_num=lead(escolaridad_rec_num),
                ant_escolaridad_rec_num=lag(escolaridad_rec_num)) %>% 
  dplyr::ungroup() %>% 
  dplyr::filter(escolaridad_rec_num>ant_escolaridad_rec_num) %>% 
  dplyr::select(hash_key,rn,fech_ing_num, escolaridad_rec, escolaridad_rec_num, sig_escolaridad_rec_num,ant_escolaridad_rec_num,no_info_on_esc) %>% 
  nrow())
[1] "Check inconsistencies with posterior educational attainments (0= No inconsistencies): 0"
paste0("Problematic rows of users with inconsistent educational attainment that were declared as missing: ",CONS_C1_df_dup_SEP_2020_match_miss4 %>%
  dplyr::filter(hash_key %in% c("14b66a29bfc79b06c9e9a8c198259b07",
                                "16dc6c5edd96af2171127c760e8480ab",
                                "18b1f9646a2cd6bebd962637cff0a21a",
                                "296c5b02088d8d42eaa25b69e3c5bdc1",
                                "309a5876b3bd6c1eb7a326d31acc2895",
                                "35f69f3fd0dda5eab952983dd39618e2",
                                "374745c85601976177fe614a7370e475",
                                "3d722d03aaa88c6ad4c039c860c1b837",
                                "428f31fc7b7f6da87f0e8590f6e147b7",
                                "4b27553c38e707a6fda01855b784cf66",
                                "520337f7e5e3ae2e27b6e4711d4744aa",
                                "56d93af7846b383ae60019c2980ab73e",
                                "5db98f93254e218ec89689241de2f61b",
                                "6c1593d4035a5ca371148c8a19ccb30b",
                                "7cb913f6fd959aace18df3fa0fa7079e",
                                "98d6644d995ea2c8777a683160728004",
                                "9e06871d982546d078729e6154d285ba",
                                "affb5fe42db45203a63e13c54ba7551b",
                                "b715d04a584dbdd450fd6f2ea68291fc",
                                "e099bd5828923c95a1ed2878a09c1118",
                                "ed31e09b0adeefb57fa54c2295b3b7f2",
                                "f273b4cdc0cf4f046811c208162571ae",
                                "f7c1eaee409f6b1b70f64e565a616942"
  )) %>% 
  #dplyr::select(hash_key,dup, escolaridad_rec) %>% 
  dplyr::select(hash_key,rn,fech_ing_num,escolaridad_rec,escolaridad_rec_num,sig_escolaridad_rec_num,ant_escolaridad_rec_num) %>%nrow())
[1] "Problematic rows of users with inconsistent educational attainment that were declared as missing: 63"


We ended having 46 missing values in educational attainment, because these values did not fulfilled the requirements of progression of the educational attainment (eg., a user could not respond to have completed secondary school, but then answer that he had completed primary school only).


Additionally, we replaced missing values of the marital status. Since different marital status were not particularly more vulnerable between each other, we selected the most frequent imputed value among the different imputed databases.


# Ver distintos valores propuestos para estado conyugal
estado_conyugal_2_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$estado_conyugal_2,
       amelia_fit$imputations$imp2$estado_conyugal_2,
       amelia_fit$imputations$imp3$estado_conyugal_2,
       amelia_fit$imputations$imp4$estado_conyugal_2,
       amelia_fit$imputations$imp5$estado_conyugal_2,
       amelia_fit$imputations$imp6$estado_conyugal_2,
       amelia_fit$imputations$imp7$estado_conyugal_2,
       amelia_fit$imputations$imp8$estado_conyugal_2,
       amelia_fit$imputations$imp9$estado_conyugal_2,
       amelia_fit$imputations$imp10$estado_conyugal_2,
       amelia_fit$imputations$imp11$estado_conyugal_2,
       amelia_fit$imputations$imp12$estado_conyugal_2,
       amelia_fit$imputations$imp13$estado_conyugal_2,
       amelia_fit$imputations$imp14$estado_conyugal_2,
       amelia_fit$imputations$imp15$estado_conyugal_2,
       amelia_fit$imputations$imp16$estado_conyugal_2,
       amelia_fit$imputations$imp17$estado_conyugal_2,
       amelia_fit$imputations$imp18$estado_conyugal_2,
       amelia_fit$imputations$imp19$estado_conyugal_2,
       amelia_fit$imputations$imp20$estado_conyugal_2,
       amelia_fit$imputations$imp21$estado_conyugal_2,
       amelia_fit$imputations$imp22$estado_conyugal_2,
       amelia_fit$imputations$imp23$estado_conyugal_2,
       amelia_fit$imputations$imp24$estado_conyugal_2,
       amelia_fit$imputations$imp25$estado_conyugal_2,
       amelia_fit$imputations$imp26$estado_conyugal_2,
       amelia_fit$imputations$imp27$estado_conyugal_2,
       amelia_fit$imputations$imp28$estado_conyugal_2,
       amelia_fit$imputations$imp29$estado_conyugal_2,
       amelia_fit$imputations$imp30$estado_conyugal_2
       ) 

estado_conyugal_2_imputed<-
estado_conyugal_2_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Married/Shared living arrangements",as.character(.))~1,TRUE~0), .names="married_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Separated/Divorced",as.character(.))~1,TRUE~0), .names="sep_div_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Single",as.character(.))~1,TRUE~0), .names="singl_{col}"))%>%
  dplyr::mutate(across(c(amelia_fit.imputations.imp1.estado_conyugal_2:amelia_fit.imputations.imp30.estado_conyugal_2),~dplyr::case_when(grepl("Widower",as.character(.))~1,TRUE~0), .names="widow_{col}"))%>%
 
  dplyr::mutate(estado_conyugal_2_married = base::rowSums(dplyr::select(., starts_with("married_"))))%>%
  dplyr::mutate(estado_conyugal_2_sep_div = base::rowSums(dplyr::select(., starts_with("sep_div_"))))%>%
  dplyr::mutate(estado_conyugal_2_singl = base::rowSums(dplyr::select(., starts_with("singl_"))))%>%
  dplyr::mutate(estado_conyugal_2_wid = base::rowSums(dplyr::select(., starts_with("widow_"))))%>%
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_married>0~1,TRUE~0)) %>% 
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_sep_div>0~estado_conyugal_2_tot+1,TRUE~estado_conyugal_2_tot)) %>% 
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_singl>0~estado_conyugal_2_tot+1,TRUE~estado_conyugal_2_tot)) %>% 
  dplyr::mutate(estado_conyugal_2_tot=dplyr::case_when(estado_conyugal_2_wid>0~estado_conyugal_2_tot+1,TRUE~estado_conyugal_2_tot)) %>% 
  janitor::clean_names()
  
estado_conyugal_2_imputed_cat_est_cony<-  
    estado_conyugal_2_imputed %>%
        tidyr::pivot_longer(c(estado_conyugal_2_married, estado_conyugal_2_sep_div, estado_conyugal_2_singl, estado_conyugal_2_wid), names_to = "cat_est_conyugal", values_to = "count") %>%
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(estado_conyugal_2_imputed_max=max(count,na.rm=T)) %>% 
        dplyr::ungroup() %>% 
        dplyr::filter(estado_conyugal_2_imputed_max==count) %>% 
        dplyr::select(amelia_fit_imputations_imp1_row,cat_est_conyugal,count) %>% 
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(n_row=n()) %>% 
        dplyr::ungroup() %>% 
        dplyr::mutate(cat_est_conyugal=dplyr::case_when(n_row>1~NA_character_,
                                                        TRUE~cat_est_conyugal)) %>% 
        dplyr::distinct(amelia_fit_imputations_imp1_row,.keep_all = T)
  
estado_conyugal_2_imputed<-
  estado_conyugal_2_imputed %>% 
    dplyr::left_join(estado_conyugal_2_imputed_cat_est_cony, by="amelia_fit_imputations_imp1_row") %>%
    dplyr::mutate(cat_est_conyugal=dplyr::case_when(cat_est_conyugal=="estado_conyugal_2_married"~"Married/Shared living arrangements",cat_est_conyugal=="estado_conyugal_2_sep_div"~"Separated/Divorced",cat_est_conyugal=="estado_conyugal_2_singl"~"Single",cat_est_conyugal=="estado_conyugal_2_wid"~"Widower"
    ))%>% 
  janitor::clean_names()

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss5<-
CONS_C1_df_dup_SEP_2020_match_miss4 %>% 
   dplyr::left_join(dplyr::select(estado_conyugal_2_imputed,amelia_fit_imputations_imp1_row,cat_est_conyugal), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(estado_conyugal_2=factor(dplyr::case_when(is.na(estado_conyugal_2)~as.character(cat_est_conyugal),TRUE~as.character(estado_conyugal_2)))) %>% 
  data.table()


We could not resolve Marital status in 10 cases due to ties in the most frequent values. We looked over possible imputations to region of the center and type of the center (public or private).


# Ver distintos valores propuestos para estado conyugal
#evaluacindelprocesoteraputico_min nombre_region tipo_centro_pub

#no hay información. debemos imputar
no_mostrar=0
if (no_mostrar==1){
tipo_centro_nombre_region_nas_nombre_region<-
CONS_C1_df_dup_SEP_2020 %>% 
    #dplyr::filter(row %in% unlist(unique(CONS_C1_df_dup_SEP_2020_match[,"row"]))) %>% 
    dplyr::filter(is.na(nombre_region)) %>% 
    janitor::tabyl(tipo_centro, nombre_region) 
}

nombre_region_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$nombre_region,
       amelia_fit$imputations$imp2$nombre_region,
       amelia_fit$imputations$imp3$nombre_region,
       amelia_fit$imputations$imp4$nombre_region,
       amelia_fit$imputations$imp5$nombre_region,
       amelia_fit$imputations$imp6$nombre_region,
       amelia_fit$imputations$imp7$nombre_region,
       amelia_fit$imputations$imp8$nombre_region,
       amelia_fit$imputations$imp9$nombre_region,
       amelia_fit$imputations$imp10$nombre_region,
       amelia_fit$imputations$imp11$nombre_region,
       amelia_fit$imputations$imp12$nombre_region,
       amelia_fit$imputations$imp13$nombre_region,
       amelia_fit$imputations$imp14$nombre_region,
       amelia_fit$imputations$imp15$nombre_region,
       amelia_fit$imputations$imp16$nombre_region,
       amelia_fit$imputations$imp17$nombre_region,
       amelia_fit$imputations$imp18$nombre_region,
       amelia_fit$imputations$imp19$nombre_region,
       amelia_fit$imputations$imp20$nombre_region,
       amelia_fit$imputations$imp21$nombre_region,
       amelia_fit$imputations$imp22$nombre_region,
       amelia_fit$imputations$imp23$nombre_region,
       amelia_fit$imputations$imp24$nombre_region,
       amelia_fit$imputations$imp25$nombre_region,
       amelia_fit$imputations$imp26$nombre_region,
       amelia_fit$imputations$imp27$nombre_region,
       amelia_fit$imputations$imp28$nombre_region,
       amelia_fit$imputations$imp29$nombre_region,
       amelia_fit$imputations$imp30$nombre_region
       ) 
nombre_region_imputed<-
nombre_region_imputed %>% 
  data.frame() %>% 
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Antofagasta",as.character(.))~1,TRUE~0), .names="reg_02_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Araucan",as.character(.))~1,TRUE~0), .names="reg_09_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Arica",as.character(.))~1,TRUE~0), .names="reg_15_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Atacama",as.character(.))~1,TRUE~0), .names="reg_03_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Ays",as.character(.))~1,TRUE~0), .names="reg_11_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Biob",as.character(.))~1,TRUE~0), .names="reg_08_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Coquimbo",as.character(.))~1,TRUE~0), .names="reg_04_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Los Lagos",as.character(.))~1,TRUE~0), .names="reg_10_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Los R",as.character(.))~1,TRUE~0), .names="reg_14_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Magallanes",as.character(.))~1,TRUE~0), .names="reg_12_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Maule",as.character(.))~1,TRUE~0), .names="reg_07_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Metropolitana",as.character(.))~1,TRUE~0), .names="reg_13_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("uble",as.character(.))~1,TRUE~0), .names="reg_16_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Higgins",as.character(.))~1,TRUE~0), .names="reg_06_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Tarapac",as.character(.))~1,TRUE~0), .names="reg_01_{col}"))%>%
dplyr::mutate(across(c(amelia_fit.imputations.imp1.nombre_region:amelia_fit.imputations.imp30.nombre_region),~dplyr::case_when(grepl("Valpara",as.character(.))~1,TRUE~0), .names="reg_05_{col}"))%>%
  
 
  dplyr::mutate(nombre_region_02 = base::rowSums(dplyr::select(., starts_with("reg_02_"))))%>%
  dplyr::mutate(nombre_region_09 = base::rowSums(dplyr::select(., starts_with("reg_09_"))))%>%
  dplyr::mutate(nombre_region_15 = base::rowSums(dplyr::select(., starts_with("reg_15_"))))%>%
  dplyr::mutate(nombre_region_03 = base::rowSums(dplyr::select(., starts_with("reg_03_"))))%>%
  dplyr::mutate(nombre_region_11 = base::rowSums(dplyr::select(., starts_with("reg_11_"))))%>%
  dplyr::mutate(nombre_region_08 = base::rowSums(dplyr::select(., starts_with("reg_08_"))))%>%
  dplyr::mutate(nombre_region_04 = base::rowSums(dplyr::select(., starts_with("reg_04_"))))%>%
  dplyr::mutate(nombre_region_10 = base::rowSums(dplyr::select(., starts_with("reg_10_"))))%>%
  dplyr::mutate(nombre_region_14 = base::rowSums(dplyr::select(., starts_with("reg_14_"))))%>%
  dplyr::mutate(nombre_region_12 = base::rowSums(dplyr::select(., starts_with("reg_12_"))))%>%
  dplyr::mutate(nombre_region_07 = base::rowSums(dplyr::select(., starts_with("reg_07_"))))%>%
  dplyr::mutate(nombre_region_13 = base::rowSums(dplyr::select(., starts_with("reg_13_"))))%>%
  dplyr::mutate(nombre_region_16 = base::rowSums(dplyr::select(., starts_with("reg_16_"))))%>%
  dplyr::mutate(nombre_region_06 = base::rowSums(dplyr::select(., starts_with("reg_06_"))))%>%
  dplyr::mutate(nombre_region_01 = base::rowSums(dplyr::select(., starts_with("reg_01_"))))%>%
  dplyr::mutate(nombre_region_05 = base::rowSums(dplyr::select(., starts_with("reg_05_"))))%>%
  #dplyr::summarise(min_mar=max(sus_ini_mod_mvv_mar[sus_ini_mod_mvv_mar<30]),min_oh=max(sus_ini_mod_mvv_oh[sus_ini_mod_mvv_oh<30]),min_pb=max(sus_ini_mod_mvv_pb[sus_ini_mod_mvv_pb<30]),min_coc=max(sus_ini_mod_mvv_coc[sus_ini_mod_mvv_coc<30]),min_otr=max(sus_ini_mod_mvv_otr[sus_ini_mod_mvv_otr<30]))
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_02>0~1,TRUE~0)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_09>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_15>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_03>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>%
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_11>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_08>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_04>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_10>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_14>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_12>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_07>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_13>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_16>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_06>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_01>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  dplyr::mutate(nombre_region_tot=dplyr::case_when(nombre_region_05>0~nombre_region_tot+1,TRUE~nombre_region_tot)) %>% 
  janitor::clean_names()
  
nombre_region_imputed_cat_reg<-  
    nombre_region_imputed %>%
        tidyr::pivot_longer(c(nombre_region_01, nombre_region_02, nombre_region_03, nombre_region_04, nombre_region_05, nombre_region_06, nombre_region_07, nombre_region_08, nombre_region_09, nombre_region_10, nombre_region_11, nombre_region_12, nombre_region_13, nombre_region_14, nombre_region_15), names_to = "cat_nombre_region", values_to = "count") %>%
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(nombre_region_imputed_max=max(count,na.rm=T)) %>% 
        dplyr::ungroup() %>% 
        dplyr::filter(nombre_region_imputed_max==count) %>% 
        dplyr::select(amelia_fit_imputations_imp1_row,cat_nombre_region,count) %>% 
        dplyr::group_by(amelia_fit_imputations_imp1_row) %>% 
        dplyr::mutate(n_row=n()) %>% 
        dplyr::ungroup() %>% 
        dplyr::mutate(cat_nombre_region=dplyr::case_when(n_row>1~NA_character_,
                                                        TRUE~cat_nombre_region)) %>% 
        dplyr::distinct(amelia_fit_imputations_imp1_row,.keep_all = T)
  
nombre_region_imputed<-
  nombre_region_imputed %>% 
    dplyr::left_join(nombre_region_imputed_cat_reg, by="amelia_fit_imputations_imp1_row") %>%
    dplyr::mutate(cat_nombre_region=dplyr::case_when(cat_nombre_region=="nombre_region_01"~"Tarapacá (01)",cat_nombre_region=="nombre_region_02"~"Antofagasta (02)",cat_nombre_region=="nombre_region_03"~"Atacama (03)",cat_nombre_region=="nombre_region_04"~"Coquimbo (04)",cat_nombre_region=="nombre_region_05"~"Valparaíso (05)",cat_nombre_region=="nombre_region_06"~"O'Higgins (06)",cat_nombre_region=="nombre_region_07"~"Maule (07)",cat_nombre_region=="nombre_region_08"~"Biobío (08)",cat_nombre_region=="nombre_region_09"~"Araucanía (09)",cat_nombre_region=="nombre_region_10"~"Los Lagos (10)",cat_nombre_region=="nombre_region_11"~"Aysén (11)",cat_nombre_region=="nombre_region_12"~"Magallanes (12)",cat_nombre_region=="nombre_region_13"~"Metropolitana (13)",
                                                 cat_nombre_region=="nombre_region_14"~"Los Ríos (14)",cat_nombre_region=="nombre_region_15"~"Arica (15)",cat_nombre_region=="nombre_region_16"~"Ñuble (16)",
    ))%>% 
  janitor::clean_names()

#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_
tipo_centro_pub_imputed<-
 cbind.data.frame(amelia_fit$imputations$imp1$row,
       amelia_fit$imputations$imp1$tipo_centro_pub,
       amelia_fit$imputations$imp2$tipo_centro_pub,
       amelia_fit$imputations$imp3$tipo_centro_pub,
       amelia_fit$imputations$imp4$tipo_centro_pub,
       amelia_fit$imputations$imp5$tipo_centro_pub,
       amelia_fit$imputations$imp6$tipo_centro_pub,
       amelia_fit$imputations$imp7$tipo_centro_pub,
       amelia_fit$imputations$imp8$tipo_centro_pub,
       amelia_fit$imputations$imp9$tipo_centro_pub,
       amelia_fit$imputations$imp10$tipo_centro_pub,
       amelia_fit$imputations$imp11$tipo_centro_pub,
       amelia_fit$imputations$imp12$tipo_centro_pub,
       amelia_fit$imputations$imp13$tipo_centro_pub,
       amelia_fit$imputations$imp14$tipo_centro_pub,
       amelia_fit$imputations$imp15$tipo_centro_pub,
       amelia_fit$imputations$imp16$tipo_centro_pub,
       amelia_fit$imputations$imp17$tipo_centro_pub,
       amelia_fit$imputations$imp18$tipo_centro_pub,
       amelia_fit$imputations$imp19$tipo_centro_pub,
       amelia_fit$imputations$imp20$tipo_centro_pub,
       amelia_fit$imputations$imp21$tipo_centro_pub,
       amelia_fit$imputations$imp22$tipo_centro_pub,
       amelia_fit$imputations$imp23$tipo_centro_pub,
       amelia_fit$imputations$imp24$tipo_centro_pub,
       amelia_fit$imputations$imp25$tipo_centro_pub,
       amelia_fit$imputations$imp26$tipo_centro_pub,
       amelia_fit$imputations$imp27$tipo_centro_pub,
       amelia_fit$imputations$imp28$tipo_centro_pub,
       amelia_fit$imputations$imp29$tipo_centro_pub,
       amelia_fit$imputations$imp30$tipo_centro_pub
       ) 

tipo_centro_pub_imputed<-
  tipo_centro_pub_imputed %>% 
  dplyr::mutate(tipo_centro_pub = base::rowSums(dplyr::select(., ends_with("tipo_centro_pub"))))%>%
  dplyr::mutate(tipo_centro_pub_to_imputation= dplyr::case_when(tipo_centro_pub>15~1,
                                                  tipo_centro_pub==15~NA_real_,
                                                  TRUE~0)) %>% 
  janitor::clean_names()

#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:#:#:#::#:#:#:

CONS_C1_df_dup_SEP_2020_match_miss6<-
CONS_C1_df_dup_SEP_2020_match_miss5 %>% 
   dplyr::left_join(dplyr::select(nombre_region_imputed,amelia_fit_imputations_imp1_row,cat_nombre_region), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
    dplyr::mutate(nombre_region=factor(dplyr::case_when(is.na(nombre_region)~as.character(cat_nombre_region),TRUE~as.character(nombre_region)))) %>% 
  dplyr::left_join(dplyr::select(tipo_centro_pub_imputed,amelia_fit_imputations_imp1_row,tipo_centro_pub_to_imputation), by=c("row"="amelia_fit_imputations_imp1_row")) %>% 
  dplyr::mutate(tipo_centro_pub=factor(dplyr::case_when(is.na(tipo_centro_pub)~as.logical(tipo_centro_pub_to_imputation),TRUE~as.logical(tipo_centro_pub)))) %>%
  data.table()
#CONS_C1_df_dup_SEP_2020_match_miss6
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$tipo_centro_pub))
#table(is.na(CONS_C1_df_dup_SEP_2020_match_miss6$nombre_region))


There were impossible to impute region of the center in 7 cases due to ties in the different imputed values.


Sample Characteristics

We checked the characteristics of the sample depending on type of treatment (Residential or Ambulatory).


#prop.table(table(CONS_C1_df_dup_SEP_2020_match$abandono_temprano_rec,CONS_C1_df_dup_SEP_2020_match$tipo_de_plan_res),2)
match.on.sel<-c("sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","estatus_ocupacional_rec","edad_ini_cons","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","nombre_region","tipo_centro_pub","prev_treat","sexo_2","edad_al_ing","fech_ing_num")

catVars<-
c("sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","estatus_ocupacional_rec","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","nombre_region","tipo_centro_pub","tipo_de_plan_res","prev_treat","sexo_2")

#añado los imputados
CONS_C1_df_dup_SEP_2020_match_not_miss<-
CONS_C1_df_dup_SEP_2020_match %>% 
  dplyr::select(-sus_ini_mod_mvv,-estado_conyugal_2,-escolaridad_rec,-freq_cons_sus_prin,-edad_ini_cons,-nombre_region,-tipo_centro_pub) %>% 
  dplyr::left_join(dplyr::select(CONS_C1_df_dup_SEP_2020_match_miss6,
                                 row,
                                 sus_ini_mod_mvv,
                                 estado_conyugal_2,
                                 escolaridad_rec,
                                 freq_cons_sus_prin,
                                 nombre_region,
                                 tipo_centro_pub,
                                 edad_ini_cons),by="row") %>% 
  dplyr::arrange(tipo_de_plan_res,hash_key,rn)
CONS_C1_df_dup_SEP_2020_match_not_miss2<-
  CONS_C1_df_dup_SEP_2020_match_not_miss[complete.cases(CONS_C1_df_dup_SEP_2020_match_not_miss[,..match.on_tot]),..match.on_tot] 


Considering that some missing values were not possible to impute, we started having nrow(CONS_C1_df_dup_SEP_2020_match_not_miss) %>% formatC(big.mark=",") cases (users=79,527), and ended having nrow(CONS_C1_df_dup_SEP_2020_match_not_miss2) %>% formatC(big.mark=",") complete cases (79,492).


kableone <- function(x, ...) {
  capture.output(x <- print(x,...))
  knitr::kable(x,format= "html", format.args= list(decimal.mark= ".", big.mark= ","))
}
#table(is.na(CONS_C1_df_dup_SEP_2020_match_not_miss$edad_ini_cons))

#length(unique(CONS_C1_df_dup_SEP_2020_match$fech_ing_num))
#:#:#:#:#: DISMINUIR LA HETEROGENEIDAD DE LA FECHA DE INGRESO
# FORMAS DE CONSTREÑIR LA VARIABLE:
#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num/10,0)
#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-cut(CONS_C1_df_dup_SEP_2020_match$fech_ing_num,100)
#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-CONS_C1_df_dup_SEP_2020_match_fech_ing_num
#CONS_C1_df_dup_SEP_2020_match_fech_ing_num<-CONS_C1_df_dup_SEP_2020_match$fech_ing_num
#length(unique(round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num,0)))
#length(unique(round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num/10,0)))

#CONS_C1_df_dup_SEP_2020_match$fech_ing_num<-round(CONS_C1_df_dup_SEP_2020_match$fech_ing_num/10,0)
#:#:#:#:#: 

label(CONS_C1_df_dup_SEP_2020_match_not_miss2$sus_ini_mod_mvv)<-"Starting Substance"
label(CONS_C1_df_dup_SEP_2020_match_not_miss2$estado_conyugal_2)<-"Marital Status"
label(CONS_C1_df_dup_SEP_2020_match_not_miss2$escolaridad_rec)<-"Educational Attainment"
label(CONS_C1_df_dup_SEP_2020_match_not_miss2$edad_ini_cons)<-"Age of Onset of Drug Use"
label(CONS_C1_df_dup_SEP_2020_match_not_miss2$freq_cons_sus_prin)<-"Frequency of use of primary drug"
label(CONS_C1_df_dup_SEP_2020_match_not_miss2$nombre_region)<-"Frequency of use of primary drug"
label(CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_centro_pub)<-"Type of Center (Public)"

pre_tab1<-Sys.time()
tab1<-
CreateTableOne(vars = match.on.sel, strata = "tipo_de_plan_res", 
                       data = CONS_C1_df_dup_SEP_2020_match_not_miss2, factorVars = catVars, smd=T)
post_tab1<-Sys.time()
diff_time_tab1=post_tab1-pre_tab1

kableone(tab1, nonnormal= c("edad_ini_cons","edad_al_ing","fech_ing_num"),#"\\hline",
                       smd=T, test=T, varLabels=T,noSpaces=T, printToggle=T, dropEqual=F) %>% 
    kableExtra::kable_styling(bootstrap_options = c("striped", "hover","condensed"),font_size= 10) %>%
  #()
  row_spec(1, bold = T, italic =T,color ="black",hline_after=T,extra_latex_after="\\arrayrulecolor{white}",font_size= 10) %>%
  #footnote(general = "Here is a general comments of the table. ",
  #        number = c("Footnote 1; ", "Footnote 2; "),
  #         alphabet = c("Footnote A; ", "Footnote B; "),
  #         symbol = c("Footnote Symbol 1; ", "Footnote Symbol 2")
  #         )%>%
  scroll_box(width = "100%", height = "400px") 
FALSE TRUE p test SMD
n 83187 18634
Starting Substance (%) <0.001 0.338
Marijuana 22141 (26.6) 6908 (37.1)
Other 1906 (2.3) 451 (2.4)
Alcohol 46859 (56.3) 7475 (40.1)
Cocaine hydrochloride 3444 (4.1) 809 (4.3)
Cocaine paste 8837 (10.6) 2991 (16.1)
Marital Status (%) <0.001 0.316
Married/Shared living arrangements 29780 (35.8) 4182 (22.4)
Separated/Divorced 9153 (11.0) 1886 (10.1)
Single 43285 (52.0) 12383 (66.5)
Widower 969 (1.2) 183 (1.0)
Educational Attainment (%) <0.001 0.056
3-Completed primary school or less 23077 (27.7) 5335 (28.6)
1-More than high school 14095 (16.9) 2774 (14.9)
2-Completed high school or less 46015 (55.3) 10525 (56.5)
Occupational Status (%) <0.001 0.854
Unemployed 25826 (31.0) 10629 (57.0)
Employed 44565 (53.6) 3008 (16.1)
Inactive 12796 (15.4) 4997 (26.8)
Age of Onset of Drug Use (median [IQR]) 15.00 [14.00, 18.00] 15.00 [13.00, 17.00] <0.001 nonnorm 0.081
Frequency of use of primary drug (%) <0.001 0.794
1 day a week or more 6082 (7.3) 373 (2.0)
2 to 3 days a week 25031 (30.1) 1790 (9.6)
4 to 6 days a week 13937 (16.8) 2316 (12.4)
Daily 33173 (39.9) 13862 (74.4)
Did not use 1547 (1.9) 120 (0.6)
Less than 1 day a week 3417 (4.1) 173 (0.9)
Motive of Admission to Treatment (%) <0.001 0.537
Spontaneous 40179 (48.3) 6427 (34.5)
Assisted Referral 6250 (7.5) 4867 (26.1)
Other 4212 (5.1) 1011 (5.4)
Justice Sector 7848 (9.4) 1100 (5.9)
Health Sector 24698 (29.7) 5229 (28.1)
Psychiatric comorbidity (%) <0.001 0.326
Without psychiatric comorbidity 32466 (39.0) 4493 (24.1)
Diagnosis unknown (under study) 15085 (18.1) 4085 (21.9)
With psychiatric comorbidity 35636 (42.8) 10056 (54.0)
Frequency of use of primary drug (%) <0.001 0.380
Antofagasta (02) 2703 (3.2) 990 (5.3)
Araucanía (09) 2416 (2.9) 217 (1.2)
Arica (15) 1651 (2.0) 1153 (6.2)
Atacama (03) 2133 (2.6) 419 (2.2)
Aysén (11) 837 (1.0) 51 (0.3)
Biobío (08) 5717 (6.9) 1000 (5.4)
Coquimbo (04) 3324 (4.0) 431 (2.3)
Los Lagos (10) 2894 (3.5) 472 (2.5)
Los Ríos (14) 1248 (1.5) 278 (1.5)
Magallanes (12) 1066 (1.3) 66 (0.4)
Maule (07) 4842 (5.8) 1009 (5.4)
Metropolitana (13) 41746 (50.2) 9141 (49.1)
Ñuble (16) 568 (0.7) 20 (0.1)
O’Higgins (06) 4156 (5.0) 839 (4.5)
Tarapacá (01) 1579 (1.9) 895 (4.8)
Valparaíso (05) 6307 (7.6) 1653 (8.9)
Type of Center (Public) = TRUE (%) 65267 (78.5) 5134 (27.6) <0.001 1.186
No. of Previous Treatments (%) <0.001 0.360
0 67267 (80.9) 12214 (65.5)
1 11873 (14.3) 4318 (23.2)
2 or more 4047 (4.9) 2102 (11.3)
Sex = Women (%) 20932 (25.2) 6062 (32.5) <0.001 0.163
Age at Admission (median [IQR]) 34.45 [27.82, 43.04] 32.84 [26.89, 40.54] <0.001 nonnorm 0.172
Date of Admission to Treatment (Numeric) (median [IQR]) 16549.00 [15764.00, 17259.00] 16316.00 [15530.00, 17101.00] <0.001 nonnorm 0.153
#"tipo_de_plan_ambulatorio",
#https://cran.r-project.org/web/packages/tableone/vignettes/smd.html
#http://rstudio-pubs-static.s3.amazonaws.com/405765_2ce448f9bde24148a5f94c535a34b70e.html
#https://cran.r-project.org/web/packages/tableone/vignettes/introduction.html
#https://cran.r-project.org/web/packages/tableone/tableone.pdf
#https://www.rdocumentation.org/packages/tableone/versions/0.12.0/topics/CreateTableOne

## Construct a table 
#standardized mean differences of greater than 0.1


We checked the similarity in the samples using other measures, such as the variance ratio of the samples and Kolmogorov-Smirnov(KS) statistics.


library(cobalt)
bal2<-bal.tab(CONS_C1_df_dup_SEP_2020_match_not_miss2[,..match.on.sel], treat = CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_de_plan_res,
         thresholds = c(m = .1, v = 2),
         binary = "std", continuous = "std",
         stats = c("mean.diffs", "variance.ratios","ks.statistics"))
Note: 's.d.denom' not specified; assuming pooled.
#"mean.diffs", "variance.ratios","ks.statistics","ovl.coefficient"

options(knitr.kable.NA = '')

bal2$Balance[,2]<-round(bal2$Balance[,2],2)
bal2$Balance[,4]<-round(bal2$Balance[,4],2)

bal2$Balance[,1:6] %>% 
    knitr::kable(.,format = "html", format.args = list(decimal.mark = ".", big.mark = ","),
               caption = paste0("Table 2. Covariate Balance in the Variables of Interest"),
               col.names = c("Nature of Variables", "Unadjusted SMDs","Threshold","Unadjusted Variance Ratios","Threshold","Unadjusted KS"),
               align =rep('c', 101)) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 10) %>%
  kableExtra::add_footnote( c(paste("Note. ")), 
                            notation = "none") %>%
  kableExtra::scroll_box(width = "100%", height = "375px")
Table 2. Covariate Balance in the Variables of Interest
Nature of Variables Unadjusted SMDs Threshold Unadjusted Variance Ratios Threshold Unadjusted KS
sus_ini_mod_mvv_Marijuana Binary 0.23 Not Balanced, >0.1 0.1045608
sus_ini_mod_mvv_Other Binary 0.01 Balanced, <0.1 0.0012908
sus_ini_mod_mvv_Alcohol Binary -0.33 Not Balanced, >0.1 0.1621487
sus_ini_mod_mvv_Cocaine hydrochloride Binary 0.01 Balanced, <0.1 0.0020146
sus_ini_mod_mvv_Cocaine paste Binary 0.16 Not Balanced, >0.1 0.0542825
estado_conyugal_2_Married/Shared living arrangements Binary -0.30 Not Balanced, >0.1 0.1335602
estado_conyugal_2_Separated/Divorced Binary -0.03 Balanced, <0.1 0.0088164
estado_conyugal_2_Single Binary 0.30 Not Balanced, >0.1 0.1442042
estado_conyugal_2_Widower Binary -0.02 Balanced, <0.1 0.0018277
escolaridad_rec_3-Completed primary school or less Binary 0.02 Balanced, <0.1 0.0088935
escolaridad_rec_1-More than high school Binary -0.06 Balanced, <0.1 0.0205699
escolaridad_rec_2-Completed high school or less Binary 0.02 Balanced, <0.1 0.0116764
estatus_ocupacional_rec_Unemployed Binary 0.54 Not Balanced, >0.1 0.2599518
estatus_ocupacional_rec_Employed Binary -0.85 Not Balanced, >0.1 0.3742954
estatus_ocupacional_rec_Inactive Binary 0.28 Not Balanced, >0.1 0.1143436
edad_ini_cons Contin. -0.08 Balanced, <0.1 0.92 Balanced, <2 0.0640874
freq_cons_sus_prin_1 day a week or more Binary -0.25 Not Balanced, >0.1 0.0530952
freq_cons_sus_prin_2 to 3 days a week Binary -0.53 Not Balanced, >0.1 0.2048394
freq_cons_sus_prin_4 to 6 days a week Binary -0.12 Not Balanced, >0.1 0.0432493
freq_cons_sus_prin_Daily Binary 0.74 Not Balanced, >0.1 0.3451327
freq_cons_sus_prin_Did not use Binary -0.11 Not Balanced, >0.1 0.0121568
freq_cons_sus_prin_Less than 1 day a week Binary -0.20 Not Balanced, >0.1 0.0317920
origen_ingreso_mod_Spontaneous Binary -0.28 Not Balanced, >0.1 0.1380890
origen_ingreso_mod_Assisted Referral Binary 0.51 Not Balanced, >0.1 0.1860573
origen_ingreso_mod_Other Binary 0.02 Balanced, <0.1 0.0036228
origen_ingreso_mod_Justice Sector Binary -0.13 Not Balanced, >0.1 0.0353098
origen_ingreso_mod_Health Sector Binary -0.04 Balanced, <0.1 0.0162813
dg_cie_10_rec_Without psychiatric comorbidity Binary -0.33 Not Balanced, >0.1 0.1491589
dg_cie_10_rec_Diagnosis unknown (under study) Binary 0.09 Balanced, <0.1 0.0378845
dg_cie_10_rec_With psychiatric comorbidity Binary 0.22 Not Balanced, >0.1 0.1112744
nombre_region_Antofagasta (02) Binary 0.10 Not Balanced, >0.1 0.0206356
nombre_region_Araucanía (09) Binary -0.12 Not Balanced, >0.1 0.0173976
nombre_region_Arica (15) Binary 0.21 Not Balanced, >0.1 0.0420293
nombre_region_Atacama (03) Binary -0.02 Balanced, <0.1 0.0031552
nombre_region_Aysén (11) Binary -0.09 Balanced, <0.1 0.0073247
nombre_region_Biobío (08) Binary -0.06 Balanced, <0.1 0.0150593
nombre_region_Coquimbo (04) Binary -0.10 Balanced, <0.1 0.0168284
nombre_region_Los Lagos (10) Binary -0.06 Balanced, <0.1 0.0094590
nombre_region_Los Ríos (14) Binary 0.00 Balanced, <0.1 0.0000834
nombre_region_Magallanes (12) Binary -0.10 Not Balanced, >0.1 0.0092726
nombre_region_Maule (07) Binary -0.02 Balanced, <0.1 0.0040579
nombre_region_Metropolitana (13) Binary -0.02 Balanced, <0.1 0.0112783
nombre_region_Ñuble (16) Binary -0.09 Balanced, <0.1 0.0057547
nombre_region_O’Higgins (06) Binary -0.02 Balanced, <0.1 0.0049345
nombre_region_Tarapacá (01) Binary 0.16 Not Balanced, >0.1 0.0290492
nombre_region_Valparaíso (05) Binary 0.05 Balanced, <0.1 0.0128917
tipo_centro_pub Binary -1.19 Not Balanced, >0.1 0.5090639
prev_treat_0 Binary -0.35 Not Balanced, >0.1 0.1531554
prev_treat_1 Binary 0.23 Not Balanced, >0.1 0.0890003
prev_treat_2 or more Binary 0.24 Not Balanced, >0.1 0.0641551
sexo_2_Women Binary 0.16 Not Balanced, >0.1 0.0736935
edad_al_ing Contin. -0.17 Not Balanced, >0.1 0.83 Balanced, <2 0.0650115
fech_ing_num Contin. -0.15 Not Balanced, >0.1 1.02 Balanced, <2 0.0837954
Note.


We generated a plot to focus on unbalanced data.


love.plot(bal2, binary = "std", thresholds = c(m = .1, m=.2))+
  theme_bw()+
  ggtitle(NULL)+
  labs(caption="Note. Vertical dashed line= Standardized Differences of .1")+
  theme( plot.caption=element_text(hjust=0))
Figure 5. Covariates Balance on Different Values

Figure 5. Covariates Balance on Different Values

#ggplot(idh_pre, aes(idh_pre$quintil_0,idh_pre$idh_0)) + 
#  geom_point(aes(colour = idh_pre$groups), size = 5) + labs(x="Quintiles", y="Percentage")  + 
#  theme_bw() +  
#  scale_fill_manual(values=c("black", "gray60") ) + 
#  scale_colour_manual(name="Groups",values =c("Control"="black", "Exposed"="gray60")) + 
#  theme(legend.key = element_rect(colour = NA)) + 
#  theme(text = element_text(size=20)) + 
#  scale_y_continuous( limits = c(0,0.4), breaks = seq(.05,.4,by=.05))

#bal.tab(bal2, treat = "tipo_de_plan_res",var.name = "fech_ing_num", data = CONS_C1_df_dup_SEP_2020_match_not_miss2)
columna_dummy <- function(df, columna) {
  df %>% 
  mutate_at(columna, ~paste(columna, eval(as.symbol(columna)), sep = "_")) %>% 
    mutate(valor = 1) %>% 
    spread(key = columna, value = valor, fill = 0)
}

match.on<-c("sus_ini_mod_mvv","estado_conyugal_2","escolaridad_rec","estatus_ocupacional_rec","edad_ini_cons","freq_cons_sus_prin","origen_ingreso_mod","dg_cie_10_rec","nombre_region","tipo_centro_pub","abandono_temprano_rec","tipo_de_plan_res","evaluacindelprocesoteraputico_min","prev_treat","sexo_2","edad_al_ing")

#nrow(CONS_C1_df_dup_SEP_2020_match[complete.cases(CONS_C1_df_dup_SEP_2020_match[,match.on_tot]),match.on_tot]) #101,384

for (i in c(1:length(catVars[-10]))){
  cat<-as.character(catVars[-10][i])
  CONS_C1_df_dup_SEP_2020_match2<-columna_dummy(CONS_C1_df_dup_SEP_2020_match2,cat)
}
match.on.sel2<-names(CONS_C1_df_dup_SEP_2020_match2)[-c(1,2,5)]

#dim(CONS_C1_df_dup_SEP_2020_match[complete.cases(CONS_C1_df_dup_SEP_2020_match[,..match.on_tot]),..match.on_tot]) #94,097
library(MatchingFrontier)
mahal.frontier <- makeFrontier(dataset =CONS_C1_df_dup_SEP_2020_match2,
 treatment = 'tipo_de_plan_res',
 #outcome = 'diff_bet_treat',
 match.on = match.on.sel2)

L1.frontier <- makeFrontier(dataset =CONS_C1_df_dup_SEP_2020_match2,
 treatment = 'tipo_de_plan_res',
 #outcome = 'abandono_temprano_rec',
 match.on = match.on.sel2,
 QOI = 'SATT',
 metric = 'L1',
 ratio = 'fixed')
 
 #Error: In makeFrontier(), missing values in the data; remove them (or impute) and try again.

# By default, makeFrontier() calculates the frontier with the Average Mahalanobis Imbalance. However, as we demonstrate, MatchingFrontier works just as easily with L1 difference.
#The default quantity of interest is the feasible sample average treatment effect on the treated or FSATT

Specification

First, we had to discretize categorical variables into logical parameters, and for countinous covariates, we created quantiles


columna_dummy <- function(df, columna) {
  df %>% 
  mutate_at(columna, ~paste(columna, eval(as.symbol(columna)), sep = "_")) %>% 
    mutate(valor = 1) %>% 
    spread(key = columna, value = valor, fill = 0)
}

quantiles = function(covar, n_q) {
    p_q = seq(0, 1, 1/n_q)
    val_q = quantile(covar, probs = p_q, na.rm = TRUE)
    covar_out = rep(NA, length(covar))
    for (i in 1:n_q) {
        if (i==1) {covar_out[covar<val_q[i+1]] = i}
        if (i>1 & i<n_q) {covar_out[covar>=val_q[i] & covar<val_q[i+1]] = i}
        if (i==n_q) {covar_out[covar>=val_q[i] & covar<=val_q[i+1]] = i}}
    covar_out
}

CONS_C1_df_dup_SEP_2020_match_not_miss3<-CONS_C1_df_dup_SEP_2020_match_not_miss2
for (i in c(1:length(catVars))){#catVars[-10] excluding treatment indicator
  cat<-as.character(catVars[i])#catVars[-10] excluding treatment indicator
  CONS_C1_df_dup_SEP_2020_match_not_miss3<-columna_dummy(CONS_C1_df_dup_SEP_2020_match_not_miss3,cat)
}
CONS_C1_df_dup_SEP_2020_match_not_miss3$tipo_de_plan_res_FALSE<-NULL
CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_ini_cons<-quantiles(CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_ini_cons,20)
CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_al_ing<-quantiles(CONS_C1_df_dup_SEP_2020_match_not_miss3$edad_al_ing,20)
CONS_C1_df_dup_SEP_2020_match_not_miss3$fech_ing_num<-quantiles(CONS_C1_df_dup_SEP_2020_match_not_miss3$fech_ing_num,20)
match.on.sel2<-names(CONS_C1_df_dup_SEP_2020_match_not_miss3)[-c(1,2,5)]
#"edad_ini_cons","edad_al_ing","fech_ing_num")

CONS_SEP_match = CONS_C1_df_dup_SEP_2020_match_not_miss2[order(CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_de_plan_res, decreasing = TRUE), ]


Match

library(designmatch)

#fine = list(covs = fine_covs)
#solver = list(name = name, t_max = t_max, approximate = 1, round_cplex = 0, trace_cplex = 0).
#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
# 1. Gurobi installation

#For an exact solution, we strongly recommend running designmatch either with CPLEX or Gurobi.  Between these two solvers, the R interface of Gurobi is considerably easier to install.  Here we provide general instructions for manually installing Gurobi and its R interface in Mac and Windows machines.

#1. Create a free academic license
#   Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/creating_a_new_academic_li.html

#2. Install the software
#   2.1. In http://www.gurobi.com/index, go to Downloads > Gurobi Software
#   2.2. Choose your operating system and press download
#
#3. Retrieve and set up your Gurobi license
#   2.1. Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/retrieving_and_setting_up_.html
#   2.2. Then follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/retrieving_a_free_academic.html
#
#4. Test your license
#   Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/testing_your_license.html
#
#5. Install the R interface of Gurobi   
#   Follow the instructions in: http://www.gurobi.com/documentation/7.0/quickstart_windows/r_installing_the_r_package.html
#   * In Windows, in R run the command install.packages("PATH\\gurobi_7.X-Y.zip", repos=NULL) where path leads to the file gurobi_7.X-Y.zip (for example PATH=C:\\gurobi702\\win64\\R; note that the path may be different in your computer), and "7.X-Y" refers to the version you are installing.
#   * In MAC, in R run the command install.packages('PATH/gurobi_7.X-Y.tgz', repos=NULL) where path leads to the file gurobi_7.X-Y.tgz (for example PATH=/Library/gurobi702/mac64/R; note that the path may be different in your computer), and "7.X-Y" refers to the version you are installing.
#       
#6. Test the installation 
#   Load the library and run the examples therein
#   * A possible error that you may get is the following: "Error: package ‘slam’ required by ‘gurobi’ could not be found". If that case, install.packages('slam') and try again.
#   You should be all set!

#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:#:#:#:#:#:#:#:#:#:#:#:#:#:#:##:
require(slam)
# Solver options
#default solver is glpk with approximate = 1
#For an exact solution, we strongly recommend using cplex or gurobi as they are much faster than the other solvers, but they do require a license (free for academics, but not for people outside universities)
t_max = 60*6
solver = "gurobi" #cplex, glpk, gurobi and symphony
solver = list(name = solver, 
t_max = t_max, #t_max is a scalar with the maximum time limit for finding the matches.within this time limit, a partial, suboptimal solution is given
approximate = 1,#. If approximate = 1 (the default), an approximate solution is found via a relaxation of the original integer program
round_cplex = 0, 
trace = 1)#turns the optimizer output on

#Indicador de tratamiento
t_ind= CONS_C1_df_dup_SEP_2020_match_not_miss2$tipo_de_plan_res

# Moment balance: constrain differences in means to be at most 0.1 standard deviations apart
#:#:#:#:#:#:#:#:#:#:#:#:#:
#######mom_covs is a matrix where each column is a covariate whose mean is to be balanced
#######mom_tols is a vector of tolerances for the maximum difference in means for the covariates in mom_covs
#######mom_targets is a vector of target moments (e.g., means) of a distribution to be approximated by matched sampling. is optional, but if #######mom_covs is specified then mom_tols needs to be specified too
#######The lengths of mom_tols and mom_target have to be equal to the number of columns of mom_covs
mom_covs = cbind(CONS_SEP_match$edad_al_ing,CONS_SEP_match$fech_ing_num,CONS_SEP_match$edad_ini_cons)
mom_tols = absstddif(mom_covs, t_ind, .05)
mom = list(covs = mom_covs, tols = mom_tols, targets = NULL)

# Mean balance
covs = cbind(CONS_SEP_match$edad_al_ing,CONS_SEP_match$fech_ing_num,CONS_SEP_match$edad_ini_cons)
meantab(covs, t_ind)
     Mis      Min      Max   Mean T   Mean C Std Dif P-val
[1,]   0    14.88    88.84    35.88    35.88       0     1
[2,]   0 13621.00 18193.00 16421.78 16421.78       0     1
[3,]   0    -3.80    74.00    16.32    16.32       0     1
# Fine balance
#is a matrix where each column is a nominal covariate for fine balance
fine_covs = cbind(CONS_SEP_match$nombre_region, CONS_SEP_match$sus_ini_mod_mvv, CONS_SEP_match$estado_conyugal_2, CONS_SEP_match$escolaridad_rec, CONS_SEP_match$estatus_ocupacional_rec, CONS_SEP_match$freq_cons_sus_prin, CONS_SEP_match$origen_ingreso_mod, CONS_SEP_match$dg_cie_10_rec, CONS_SEP_match$prev_treat, CONS_SEP_match$sexo_2)
fine = list(covs = fine_covs)

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#MATCH
start.time <- Sys.time()
set.seed(2125)
out = cardmatch(t_ind, #ES NECESARIO QUE LOS TRATAMIENTOS ESTEN ORDENADOS Y LOS OTROS VECTORES TAMBIËN 
                mom = mom,# ya los definí list(covs = mom_covs, tols = mom_tols, targets = mom_targets), 
          fine = fine, 
          solver = solver)
  Building the matching problem... 
  Gurobi optimizer is open... 
  Finding the optimal matches... 
Gurobi Optimizer version 9.1.0 build v9.1.0rc0 (win64)
Thread count: 6 physical cores, 12 logical processors, using up to 12 threads
Optimize a model with 56 rows, 101821 columns and 1629136 nonzeros
Model fingerprint: 0xd1515fd2
Variable types: 0 continuous, 101821 integer (101821 binary)
Coefficient statistics:
  Matrix range     [1e-02, 2e+04]
  Objective range  [1e+00, 1e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [0e+00, 0e+00]
Found heuristic solution: objective -0.0000000
Presolve time: 2.02s
Presolved: 56 rows, 101821 columns, 1629048 nonzeros
Variable types: 0 continuous, 101821 integer (101821 binary)

Root relaxation: objective 1.863400e+04, 481 iterations, 1.20 seconds
Total elapsed time = 5.55s

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0 18634.0000    0    8   -0.00000 18634.0000      -     -    8s
H    0     0                    3493.0000000 18634.0000   433%     -   13s
     0     0 18634.0000    0   36 3493.00000 18634.0000   433%     -   13s
H    0     0                    18634.000000 18634.0000  0.00%     -   14s
     0     0 18634.0000    0   36 18634.0000 18634.0000  0.00%     -   14s

Cutting planes:
  Zero half: 1

Explored 1 nodes (9226 simplex iterations) in 14.74 seconds
Thread count was 12 (of 12 available processors)

Solution count 3: 18634 3493 -0 

Optimal solution found (tolerance 1.00e-04)
Best objective 1.863400000000e+04, best bound 1.863400000000e+04, gap 0.0000%
  Optimal matches found 
end.time <- Sys.time()
time.taken <- end.time - start.time
# Fine balance (note here we are getting an approximate solution)
#for (i in 1:ncol(fine_covs)) {     
#   print(finetab(fine_covs[, i], t_id_1, c_id_1))
#}
# Indices of the treated units and matched controls
t_id_1 = out$t_id  
c_id_1 = out$c_id   

paste0("No. of treatments: ",table(table(t_id_1)) %>% formatC(big.mark = ","),"; No. of controls: ",table(table(c_id_1))%>% formatC(big.mark = ","))
[1] "No. of treatments: 4,168; No. of controls: 33,100"
d_match = CONS_SEP_match[c(t_id_1, c_id_1), ]
#cuidado, el anterior me encontró más del mismo control para un tratado
#por eso ocuparé el de más abajo
d_match_no_duplicates = CONS_SEP_match[which(CONS_SEP_match$row %in% c(t_id_1, c_id_1)), ]


vars_ecdf<- c('edad_al_ing', 'edad_ini_cons', 'fech_ing_num')
headings <- c("Age at Admission", "Age of Onset of Drug Use", "Date of Admission")
for (i in 1:length(headings)) {
  cat("### ",headings[i],"\n")
  f<-vars_ecdf[i]
  ecdfplot(as.data.frame(CONS_SEP_match)[,f], t_id_1, c_id_1, main_title = "", legend_position = "right")
  cat('\n\n')
}

Age at Admission

Error in as.data.frame(CONS_SEP_match): objeto 'CONS_SEP_match' no encontrado


X_mat<-cbind(CONS_SEP_match$nombre_region, CONS_SEP_match$sus_ini_mod_mvv, CONS_SEP_match$estado_conyugal_2, CONS_SEP_match$escolaridad_rec, CONS_SEP_match$estatus_ocupacional_rec, CONS_SEP_match$freq_cons_sus_prin, CONS_SEP_match$origen_ingreso_mod, CONS_SEP_match$dg_cie_10_rec, CONS_SEP_match$prev_treat, CONS_SEP_match$sexo_2, CONS_SEP_match$edad_al_ing, CONS_SEP_match$edad_ini_cons, CONS_SEP_match$fech_ing_num)
Error in cbind(CONS_SEP_match$nombre_region, CONS_SEP_match$sus_ini_mod_mvv, : objeto 'CONS_SEP_match' no encontrado
vline = 0.1
loveplot(X_mat=X_mat, t_id=t_id_1, c_id=c_id_1, v_line=vline, legend_position = "topright")
Error in loveplot(X_mat = X_mat, t_id = t_id_1, c_id = c_id_1, v_line = vline, : objeto 'X_mat' no encontrado


t_max = 60*6
solver = "gurobi" #cplex, glpk, gurobi and symphony
solver2 = list(name = solver, 
t_max = t_max, #t_max is a scalar with the maximum time limit for finding the matches.within this time limit, a partial, suboptimal solution is given
approximate = 0,#. If approximate = 1 (the default), an approximate solution is found via a relaxation of the original integer program
round_cplex = 0, 
trace = 1)#turns the optimizer output on

# Moment balance: constrain differences in means to be at most 0.1 standard deviations apart
#:#:#:#:#:#:#:#:#:#:#:#:#:

mom_covs2 = cbind(CONS_SEP_match$edad_al_ing,CONS_SEP_match$fech_ing_num,CONS_SEP_match$edad_ini_cons)
mom_tols2 = absstddif(mom_covs, t_ind, .03)
mom2 = list(covs = mom_covs2, tols = mom_tols2, targets = NULL)

# Mean balance
covs2 = cbind(CONS_SEP_match$edad_al_ing,CONS_SEP_match$fech_ing_num,CONS_SEP_match$edad_ini_cons)
meantab(covs2, t_ind)
     Mis      Min      Max   Mean T   Mean C Std Dif P-val
[1,]   0    14.88    88.84    35.88    35.88       0     1
[2,]   0 13621.00 18193.00 16421.78 16421.78       0     1
[3,]   0    -3.80    74.00    16.32    16.32       0     1
# Fine balance
#is a matrix where each column is a nominal covariate for fine balance
fine_covs2 = cbind(CONS_SEP_match$nombre_region, CONS_SEP_match$sus_ini_mod_mvv, CONS_SEP_match$estado_conyugal_2, CONS_SEP_match$escolaridad_rec, CONS_SEP_match$estatus_ocupacional_rec, CONS_SEP_match$freq_cons_sus_prin, CONS_SEP_match$origen_ingreso_mod, CONS_SEP_match$dg_cie_10_rec, CONS_SEP_match$prev_treat, CONS_SEP_match$sexo_2)
fine2 = list(covs = fine_covs2)

#MATCH
start.time2 <- Sys.time()
set.seed(2125)
out2 = cardmatch(t_ind, #ES NECESARIO QUE LOS TRATAMIENTOS ESTEN ORDENADOS Y LOS OTROS VECTORES TAMBIËN 
                mom = mom,# ya los definí list(covs = mom_covs, tols = mom_tols, targets = mom_targets), 
          fine = fine2, 
          solver = solver2)
  Building the matching problem... 
  Gurobi optimizer is open... 
  Finding the optimal matches... 
Gurobi Optimizer version 9.1.0 build v9.1.0rc0 (win64)
Thread count: 6 physical cores, 12 logical processors, using up to 12 threads
Optimize a model with 56 rows, 101821 columns and 1629136 nonzeros
Model fingerprint: 0xd1515fd2
Variable types: 0 continuous, 101821 integer (101821 binary)
Coefficient statistics:
  Matrix range     [1e-02, 2e+04]
  Objective range  [1e+00, 1e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [0e+00, 0e+00]
Found heuristic solution: objective -0.0000000
Presolve time: 2.00s
Presolved: 56 rows, 101821 columns, 1629048 nonzeros
Variable types: 0 continuous, 101821 integer (101821 binary)

Root relaxation: objective 1.863400e+04, 481 iterations, 1.20 seconds
Total elapsed time = 5.53s

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0 18634.0000    0    8   -0.00000 18634.0000      -     -    8s
H    0     0                    3493.0000000 18634.0000   433%     -   13s
     0     0 18634.0000    0   36 3493.00000 18634.0000   433%     -   13s
H    0     0                    18634.000000 18634.0000  0.00%     -   14s
     0     0 18634.0000    0   36 18634.0000 18634.0000  0.00%     -   14s

Cutting planes:
  Zero half: 1

Explored 1 nodes (9226 simplex iterations) in 14.65 seconds
Thread count was 12 (of 12 available processors)

Solution count 3: 18634 3493 -0 

Optimal solution found (tolerance 1.00e-04)
Best objective 1.863400000000e+04, best bound 1.863400000000e+04, gap 0.0000%
  Optimal matches found 
end.time2 <- Sys.time()
time.taken2 <- end.time2 - start.time2

t_id_2 = out2$t_id  
c_id_2 = out2$c_id  

paste0("No. of treatments: ",table(table(t_id_2)) %>% formatC(big.mark = ","),"; No. of controls: ",table(table(c_id_2))%>% formatC(big.mark = ","))
[1] "No. of treatments: 4,168; No. of controls: 33,100"
d_match2 = CONS_SEP_match[c(t_id_2, c_id_2), ]
d_match2_no_duplicates = CONS_SEP_match[which(CONS_SEP_match$row %in% c(t_id_2, c_id_2)), ]


vars_ecdf<- c('edad_al_ing', 'edad_ini_cons', 'fech_ing_num')
headings <- c("Age at Admission", "Age of Onset of Drug Use", "Date of Admission")
for (i in 1:length(headings)) {
  cat("### ",headings[i],"\n")
  f<-vars_ecdf[i]
  ecdfplot(as.data.frame(CONS_SEP_match)[,f], t_id_2, c_id_2, main_title = "", legend_position = "right")
  cat('\n\n')
}

Age at Admission

Error in as.data.frame(CONS_SEP_match): objeto 'CONS_SEP_match' no encontrado


X_mat<-cbind(CONS_SEP_match$nombre_region, CONS_SEP_match$sus_ini_mod_mvv, CONS_SEP_match$estado_conyugal_2, CONS_SEP_match$escolaridad_rec, CONS_SEP_match$estatus_ocupacional_rec, CONS_SEP_match$freq_cons_sus_prin, CONS_SEP_match$origen_ingreso_mod, CONS_SEP_match$dg_cie_10_rec, CONS_SEP_match$prev_treat, CONS_SEP_match$sexo_2, CONS_SEP_match$edad_al_ing, CONS_SEP_match$edad_ini_cons, CONS_SEP_match$fech_ing_num)
Error in cbind(CONS_SEP_match$nombre_region, CONS_SEP_match$sus_ini_mod_mvv, : objeto 'CONS_SEP_match' no encontrado
vline = 0.1
loveplot(X_mat=X_mat, t_id=t_id_2, c_id=c_id_2, v_line=vline, legend_position = "topright")
Error in loveplot(X_mat = X_mat, t_id = t_id_2, c_id = c_id_2, v_line = vline, : objeto 'X_mat' no encontrado


Moment Balance


CONS_SEP_match2= CONS_C1_df_dup_SEP_2020_match_not_miss3[order(CONS_C1_df_dup_SEP_2020_match_not_miss3$tipo_de_plan_res, decreasing = TRUE), ]

t_max = 60*6
solver = "gurobi" #cplex, glpk, gurobi and symphony
solver3 = list(name = solver, 
t_max = t_max, #t_max is a scalar with the maximum time limit for finding the matches.within this time limit, a partial, suboptimal solution is given
approximate = 0,#. If approximate = 1 (the default), an approximate solution is found via a relaxation of the original integer program
round_cplex = 0, 
trace = 1)#turns the optimizer output on

# Moment balance: constrain differences in means to be at most 0.1 standard deviations apart
#:#:#:#:#:#:#:#:#:#:#:#:#:

mom_covs3 = cbind(CONS_SEP_match2$nombre_region, 
                   CONS_SEP_match2$sus_ini_mod_mvv, 
                   CONS_SEP_match2$estado_conyugal_2, 
                   CONS_SEP_match2$escolaridad_rec, 
                   CONS_SEP_match2$estatus_ocupacional_rec, 
                   CONS_SEP_match2$freq_cons_sus_prin, 
                   CONS_SEP_match2$origen_ingreso_mod, 
                   CONS_SEP_match2$dg_cie_10_rec, 
                   CONS_SEP_match2$prev_treat, 
                   CONS_SEP_match2$sexo_2,
                   CONS_SEP_match2$edad_al_ing, 
                   CONS_SEP_match2$edad_ini_cons, 
                   CONS_SEP_match2$fech_ing_num)
mom_tols3 = absstddif(mom_covs, t_ind, .03)
mom3 = list(covs = mom_covs3, tols = mom_tols3, targets = NULL)

# Mean balance
covs2 = cbind(CONS_SEP_match2$edad_al_ing,CONS_SEP_match2$fech_ing_num,CONS_SEP_match2$edad_ini_cons)
meantab(covs2, t_ind)

# Fine balance
#is a matrix where each column is a nominal covariate for fine balance
fine_covs3 = cbind(CONS_SEP_match2$edad_ini_cons, 
                   CONS_SEP_match2$edad_al_ing, 
                   CONS_SEP_match2$fech_ing_num, 
                   CONS_SEP_match2$sus_ini_mod_mvv_Alcohol, 
                   CONS_SEP_match2$`sus_ini_mod_mvv_Cocaine hydrochloride`, 
                   CONS_SEP_match2$`sus_ini_mod_mvv_Cocaine paste`, 
                   CONS_SEP_match2$`sus_ini_mod_mvv_Marijuana`, 
                   CONS_SEP_match2$sus_ini_mod_mvv_Other, 
                   CONS_SEP_match2$`estado_conyugal_2_Married/Shared living arrangements`, 
                   CONS_SEP_match2$`estado_conyugal_2_Separated/Divorced`,
                   CONS_SEP_match2$estado_conyugal_2_Single, 
                   CONS_SEP_match2$estado_conyugal_2_Widower,
                   CONS_SEP_match2$estado_conyugal_2_Widower,
                   CONS_SEP_match2$estado_conyugal_2_Widower,
                   CONS_SEP_match2$estado_conyugal_2_Widower,
                   CONS_SEP_match2$estado_conyugal_2_Widower,
                   CONS_SEP_match2$fech_ing_num)
fine3 = list(covs = fine_covs3)

#MATCH
start.time3 <- Sys.time()
set.seed(2125)
out3= cardmatch(t_ind, #ES NECESARIO QUE LOS TRATAMIENTOS ESTEN ORDENADOS Y LOS OTROS VECTORES TAMBIËN 
                mom = mom3,# ya los definí list(covs = mom_covs, tols = mom_tols, targets = mom_targets), 
          solver = solver3)
end.time3 <- Sys.time()
time.taken3 <- end.time3 - start.time3

t_id_3 = out2$t_id  
c_id_3 = out2$c_id  

paste0("No. of treatments: ",table(table(t_id_3)) %>% formatC(big.mark = ","),"; No. of controls: ",table(table(c_id_3))%>% formatC(big.mark = ","))
d_match3 = CONS_SEP_match2[c(t_id_3, c_id_3), ]
d_match3_no_duplicates = CONS_SEP_match2[which(CONS_SEP_match2$row %in% c(t_id_3, c_id_3)), ]

Inference


Sensitiviy Analyses


Session Info

Sys.getenv("R_LIBS_USER")
[1] "G:/Mi unidad/Alvacast/SISTRAT 2019 (github)/renv/library/R-4.0/x86_64-w64-mingw32;C:/Users/CISS Fondecyt/AppData/Local/Temp/Rtmp6P5XQn/renv-system-library"
rstudioapi::getSourceEditorContext()
Document Context: 
- id:        '37B3EACC'
- path:      'G:/Mi unidad/Alvacast/SISTRAT 2019 (github)/SUD_CL/Matching Process.Rmd'
- contents:  <2098 rows>
Document Selection:
- [1775, 53] -- [1775, 53]: ''
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 18363)

Matrix products: default

locale:
[1] LC_COLLATE=Spanish_Chile.1252  LC_CTYPE=Spanish_Chile.1252   
[3] LC_MONETARY=Spanish_Chile.1252 LC_NUMERIC=C                  
[5] LC_TIME=Spanish_Chile.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] Amelia_1.7.6            Rcpp_1.0.4.6            polycor_0.7-10         
 [4] gurobi_9.1-0            radiant.update_1.4.1    cobalt_4.2.3           
 [7] sensitivityfull_1.5.6   sensitivity2x2xk_1.01   MatchIt_3.0.2          
[10] tableone_0.12.0         stargazer_5.2.2         reshape2_1.4.4         
[13] exactRankTests_0.8-31   gridExtra_2.3           foreign_0.8-80         
[16] glpkAPI_1.3.2           designmatch_0.3.1       Rglpk_0.6-4            
[19] slam_0.1-47             MASS_7.3-51.6           survMisc_0.5.5         
[22] ggfortify_0.4.10        rateratio.test_1.0-2    survminer_0.4.8        
[25] ggpubr_0.3.0            epiR_1.0-15             forcats_0.5.0          
[28] purrr_0.3.4             readr_1.3.1             tibble_3.0.1           
[31] tidyverse_1.3.0         treemapify_2.5.3        ggiraph_0.7.0          
[34] chilemapas_0.2          sf_0.9-3                finalfit_1.0.1         
[37] lsmeans_2.30-0          emmeans_1.4.7           choroplethrAdmin1_1.1.1
[40] choroplethrMaps_1.0.1   choroplethr_3.6.3       acs_2.1.4              
[43] XML_3.99-0.3            RColorBrewer_1.1-2      panelr_0.7.3           
[46] lme4_1.1-23             Matrix_1.2-18           dplyr_1.0.0            
[49] data.table_1.12.8       codebook_0.9.2          Statamarkdown_0.4.5    
[52] devtools_2.3.0          usethis_1.6.1           sqldf_0.4-11           
[55] RSQLite_2.2.0           gsubfn_0.7              proto_1.0.0            
[58] broom_0.7.0             zoo_1.8-8               altair_4.0.1           
[61] rbokeh_0.5.1            janitor_2.0.1           plotly_4.9.2.1         
[64] kableExtra_1.1.0        Hmisc_4.4-0             Formula_1.2-3          
[67] survival_3.2-3          lattice_0.20-41         ggplot2_3.3.2          
[70] stringr_1.4.0           stringi_1.4.6           tidyr_1.1.0            
[73] knitr_1.29              matrixStats_0.56.0      boot_1.3-25            
[76] DiagrammeR_1.0.6.1.9000

loaded via a namespace (and not attached):
  [1] estimability_1.3    rappdirs_0.3.1      coda_0.19-3        
  [4] acepack_1.4.1       bit64_0.9-7         multcomp_1.4-13    
  [7] rpart_4.1-15        generics_0.0.2      callr_3.4.3        
 [10] TH.data_1.0-10      mice_3.9.0          ggfittext_0.9.0    
 [13] chron_2.3-55        bit_1.1-15.2        webshot_0.5.2      
 [16] xml2_1.3.2          lubridate_1.7.9     assertthat_0.2.1   
 [19] xfun_0.15           hms_0.5.3           evaluate_0.14      
 [22] fansi_0.4.1         dbplyr_1.4.4        readxl_1.3.1       
 [25] km.ci_0.5-2         DBI_1.1.0           htmlwidgets_1.5.1  
 [28] jsonvalidate_1.1.0  ellipsis_0.3.1      crosstalk_1.1.0.1  
 [31] backports_1.1.8     V8_3.1.0            insight_0.8.4      
 [34] survey_4.0          ggcorrplot_0.1.3    vctrs_0.3.1        
 [37] remotes_2.1.1       sjlabelled_1.1.5    abind_1.4-5        
 [40] withr_2.2.0         pryr_0.1.4          tigris_0.9.4       
 [43] checkmate_2.0.0     rgdal_1.5-8         ggmap_3.0.0        
 [46] prettyunits_1.1.1   cluster_2.1.0       lazyeval_0.2.2     
 [49] crayon_1.3.4        crul_0.9.0          labeling_0.3       
 [52] pkgconfig_2.0.3     units_0.6-6         nlme_3.1-148       
 [55] pkgload_1.1.0       nnet_7.3-14         rlang_0.4.6        
 [58] RJSONIO_1.3-1.4     lifecycle_0.2.0     sandwich_2.5-1     
 [61] httpcode_0.3.0      modelr_0.1.8        cellranger_1.1.0   
 [64] tcltk_4.0.2         rprojroot_1.3-2     KMsurv_0.1-5       
 [67] carData_3.0-4       reprex_0.3.0        base64enc_0.1-3    
 [70] processx_3.4.2      png_0.1-7           viridisLite_0.3.0  
 [73] rjson_0.2.20        parameters_0.7.0    bitops_1.0-6       
 [76] KernSmooth_2.23-17  visNetwork_2.0.9    pander_0.6.3       
 [79] blob_1.2.1          classInt_0.4-3      maptools_1.0-1     
 [82] jpeg_0.1-8.1        rstatix_0.5.0       ggeffects_0.14.3   
 [85] ggsignif_0.6.0      scales_1.1.1        memoise_1.1.0      
 [88] magrittr_1.5        plyr_1.8.6          hexbin_1.28.1      
 [91] compiler_4.0.2      snakecase_0.11.0    cli_2.0.2          
 [94] ps_1.3.3            htmlTable_2.0.0     tidyselect_1.1.0   
 [97] highr_0.8           mitools_2.4         jtools_2.0.5       
[100] yaml_2.2.1          latticeExtra_0.6-29 grid_4.0.2         
[103] tools_4.0.2         rmapshaper_0.4.4    rio_0.5.16         
[106] RgoogleMaps_1.4.5.3 rstudioapi_0.11     uuid_0.1-4         
[109] gistr_0.5.0         farver_2.0.3        sjPlot_2.8.4       
[112] digest_0.6.25       geojsonlint_0.4.0   car_3.0-8          
[115] performance_0.4.6   httr_1.4.2          gdtools_0.2.2      
[118] WDI_2.6.0           effectsize_0.3.1    sjstats_0.18.0     
[121] colorspace_1.4-1    rvest_0.3.5         fs_1.4.2           
[124] reticulate_1.16     splines_4.0.2       statmod_1.4.34     
[127] sp_1.4-2            vegawidget_0.3.1    sessioninfo_1.1.1  
[130] systemfonts_0.2.3   xtable_1.8-4        jsonlite_1.7.0     
[133] nloptr_1.2.2.1      testthat_2.3.2      R6_2.4.1           
[136] pillar_1.4.6        htmltools_0.5.0     glue_1.4.1         
[139] minqa_1.2.4         class_7.3-17        codetools_0.2-16   
[142] maps_3.3.0          pkgbuild_1.1.0      mvtnorm_1.1-1      
[145] curl_4.3            BiasedUrn_1.07      zip_2.0.4          
[148] openxlsx_4.1.5      rmarkdown_2.5       repr_1.1.0         
[151] desc_1.2.0          munsell_0.5.0       e1071_1.7-3        
[154] labelled_2.5.0      sjmisc_2.8.5        haven_2.3.1        
[157] gtable_0.3.0        bayestestR_0.6.0